17 changes: 8 additions & 9 deletions .github/workflows/release-stable.yaml
@@ -42,25 +42,24 @@ jobs:

# Fetch latest state after semantic-release pushed commits
# This ensures we get all commits that semantic-release created
-git fetch origin release:release
-git fetch origin main:main
+git fetch origin

-# Check if main is already up to date
-if git diff --quiet main release; then
+# Check if main is already up to date with release
+if git diff --quiet origin/main origin/release; then
echo "main is already up to date with release"
exit 0
fi

# Check if main is ancestor of release (can fast-forward)
-if git merge-base --is-ancestor main release; then
+if git merge-base --is-ancestor origin/main origin/release; then
echo "Fast-forwarding main to release"
-git checkout main
-git merge --ff-only release
+git checkout -B main origin/main
+git merge --ff-only origin/release
git push origin main
else
echo "Rebasing main onto release (force-with-lease required)"
-git checkout main
-git rebase release
+git checkout -B main origin/main
+git rebase origin/release
# Use --force-with-lease for safety: only push if remote hasn't changed
git push --force-with-lease origin main
fi
302 changes: 69 additions & 233 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "uv_build"

[project]
name = "logging-objects-with-schema"
version = "0.1.2"
version = "0.1.3rc1"
description = "Proxy logging wrapper that validates extra fields against a JSON schema."
readme = "README.md"
requires-python = ">=3.10"
43 changes: 41 additions & 2 deletions src/logging_objects_with_schema/errors.py
@@ -9,8 +9,27 @@
class SchemaProblem:
"""Describes a single problem encountered while loading the schema.

+This class is used to report schema validation errors during schema
+compilation. Schema problems are fatal: if any are detected during
+logger initialization, the application is terminated after logging
+all problems to stderr via ``os._exit(1)``.
+
+Schema problems can occur due to:
+- Missing or inaccessible schema file
+- Invalid JSON syntax
+- Invalid schema structure (non-object top-level, missing required fields)
+- Unknown type names or invalid type declarations
+- Root key conflicts with reserved logging fields
+- Excessive nesting depth (exceeds MAX_SCHEMA_DEPTH)
+
Attributes:
-message: Human-readable description of the problem.
+message: Human-readable description of the problem. Examples include:
+- "Schema file not found: /path/to/logging_objects_with_schema.json"
+- "Failed to parse JSON schema: Expecting ',' delimiter: line 5 column 10"
+- "Unknown type 'string' at ServicePayload.RequestID"
+- "Root key 'name' conflicts with reserved logging fields"
+- "Schema nesting depth exceeds maximum allowed depth of 100 at path ServicePayload.Metrics"
"""

message: str
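
To make the fatal path concrete, here is a minimal sketch of the behaviour the new docstring describes. The initialization code itself is not part of this diff, so the scaffolding below is assumed, not taken from the package:

```python
import os
import sys

from logging_objects_with_schema.errors import SchemaProblem

# Hypothetical: pretend the schema loader reported a single problem.
problems = [
    SchemaProblem("Schema file not found: /path/to/logging_objects_with_schema.json")
]
if problems:
    for problem in problems:
        print(problem.message, file=sys.stderr)  # log every problem to stderr
    os._exit(1)  # then terminate, as described above
```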
@@ -20,8 +39,28 @@ class SchemaProblem:
class DataProblem:
"""Describes a single problem encountered while validating log data.

+This class is used to report validation errors when applying the compiled
+schema to user-provided ``extra`` fields during logging. Unlike
+:class:`SchemaProblem`, data problems are not fatal: they are collected
+and logged as ERROR messages *after* the main log record has been emitted,
+ensuring 100% compatibility with standard logger behavior.
+
+Data problems can occur due to:
+- Type mismatches (e.g., providing str where int is expected)
+- None values (None is never allowed for any type)
+- Invalid list elements (non-homogeneous lists, non-primitive elements)
+- Redundant fields (fields not defined in the schema)
+
Attributes:
-message: Human-readable description of the validation problem.
+message: JSON string containing structured error information. The message
+is always a valid JSON object with the following structure:
+``{"field": "...", "error": "...", "value": "..."}``
+All values are serialized via ``repr()`` for safety and consistency.
+Examples:
+- ``{"field": "'user_id'", "error": "'has type str, expected int'", "value": "'abc-123'"}``
+- ``{"field": "'request_id'", "error": "'is None'", "value": "None"}``
+- ``{"field": "'tags'", "error": "\"is a list but contains elements with types ['dict']; expected all elements to be of type str\"", "value": "[{'key': 'color'}]"}``  # noqa: E501
+- ``{"field": "'unknown_field'", "error": "'is not defined in schema'", "value": "'some_value'"}``
"""

message: str
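
Because the message is documented as always being a valid JSON object, a consumer can parse it back into structured fields. A small sketch under that assumption:

```python
import json

from logging_objects_with_schema.errors import DataProblem

# Hypothetical problem, using the message format documented above.
problem = DataProblem(
    '{"field": "\'user_id\'", "error": "\'has type str, expected int\'", "value": "\'abc-123\'"}'
)
details = json.loads(problem.message)
print(details["field"])  # 'user_id' (repr-quoted, per the docstring)
print(details["error"])  # 'has type str, expected int'
```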
122 changes: 103 additions & 19 deletions src/logging_objects_with_schema/schema_applier.py
@@ -6,6 +6,7 @@

from __future__ import annotations

+import json
from collections import defaultdict
from collections.abc import Mapping, MutableMapping
from typing import Any
@@ -14,6 +15,32 @@
from .schema_loader import CompiledSchema, SchemaLeaf


+def _create_validation_error_json(field: str, error: str, value: Any) -> str:
+"""Create JSON string for a single validation error.
+
+All values are wrapped in repr() before JSON serialization. This ensures:
+- Any value type can be safely serialized (even non-JSON-serializable types)
+- The error message always contains a valid Python representation of the value
+- Security: prevents issues with special characters or control sequences
+- Consistency: all error messages have the same format regardless of value type
+
+Args:
+field: Field name that caused the validation error.
+error: Error description.
+value: Invalid value that caused the error.
+
+Returns:
+JSON string with field, error, and value (all via repr() for safety).
+"""
+return json.dumps(
+{
+"field": repr(field),
+"error": repr(error),
+"value": repr(value),
+}
+)
+
+
def _validate_list_value(
value: list,
source: str,
@@ -34,25 +61,29 @@ def _validate_list_value(
DataProblem if validation fails, None if validation succeeds.
"""
if item_expected_type is None:
-return DataProblem(
-f"Field '{source}' is list but has no item type configured",
-)
+error_msg = "is a list but has no item type configured"
+return DataProblem(_create_validation_error_json(source, error_msg, value))

if len(value) == 0:
# Empty lists are always valid
return None

+# Collect unique type names of items that don't match the expected type.
+# We use a set comprehension to get unique type names (not the types themselves)
+# for the error message. This gives a clear, readable error message showing
+# which types were found (e.g., "int, str") vs what was expected.
invalid_item_types = {
type(item).__name__ for item in value if type(item) is not item_expected_type
}

if invalid_item_types:
-return DataProblem(
-f"Field '{source}' is a list but contains elements "
+error_msg = (
+f"is a list but contains elements "
f"with types {sorted(invalid_item_types)}; "
f"expected all elements to be of type "
-f"{item_expected_type.__name__}",
+f"{item_expected_type.__name__}"
)
+return DataProblem(_create_validation_error_json(source, error_msg, value))

return None
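
For reference, roughly what the new helper emits for the cases above; the output lines are reconstructed from the repr()-then-json.dumps() logic rather than taken from a test run:

```python
print(_create_validation_error_json("user_id", "has type str, expected int", "abc-123"))
# {"field": "'user_id'", "error": "'has type str, expected int'", "value": "'abc-123'"}

print(_create_validation_error_json("request_id", "is None", None))
# {"field": "'request_id'", "error": "'is None'", "value": "None"}
```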

@@ -73,13 +104,22 @@ def _set_nested_value(
value: The value to set at the target location.
"""
current = target
+# Navigate through intermediate dictionaries, creating them as needed.
+# We iterate through all keys except the last one (path[:-1]) to build
+# the nested structure.
for key in path[:-1]:
child = current.get(key)
+# If the key doesn't exist or exists but is not a dict, create a new dict.
+# This overwrites any non-dict value that might have been there (which
+# shouldn't happen in normal operation, but we handle it defensively).
+# We use isinstance() instead of checking for None because we need to
+# ensure the value is actually a dict, not just that the key exists.
if not isinstance(child, dict):
child = {}
current[key] = child
current = child

+# Set the final value at the last key in the path
current[path[-1]] = value
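
A minimal sketch of the nested-write behaviour; the full parameter list is truncated in this diff, so the call shape (target mapping, tuple of keys, value) is assumed:

```python
target: dict = {}
_set_nested_value(target, ("ServicePayload", "RequestID"), "abc-123")
assert target == {"ServicePayload": {"RequestID": "abc-123"}}

# A non-dict intermediate value is overwritten defensively, per the comment above:
target = {"ServicePayload": 42}
_set_nested_value(target, ("ServicePayload", "RequestID"), "abc-123")
assert target == {"ServicePayload": {"RequestID": "abc-123"}}
```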


@@ -107,11 +147,12 @@ def _validate_and_apply_leaf(
# bool is a subclass of int). This ensures that the actual
# runtime type matches the schema type exactly.
if type(value) is not leaf.expected_type:
+error_msg = (
+f"has type {type(value).__name__}, "
+f"expected {leaf.expected_type.__name__}"
+)
problems.append(
-DataProblem(
-f"Field '{source}' has type {type(value).__name__}, "
-f"expected {leaf.expected_type.__name__}",
-),
+DataProblem(_create_validation_error_json(source, error_msg, value))
)
return
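
The strict identity check matters mainly for bool, which is a subclass of int; isinstance() would accept True for an int field, while the comparison used here rejects it:

```python
value = True

print(isinstance(value, int))  # True  -> would wrongly accept bool for an int field
print(type(value) is int)      # False -> correctly reported as a type mismatch
```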

@@ -138,6 +179,12 @@ def _strip_empty(node: Any) -> Any:
This helper is used by ``_apply_schema_internal`` on the final payload
to avoid leaving empty containers created during schema application.

+Note: Lists are not processed (they are returned as-is). This is intentional:
+- Lists in the schema are always homogeneous primitive types (validated earlier)
+- Empty lists are valid and should be preserved
+- We only need to clean up empty dicts that were created as intermediate
+containers during schema application but ended up empty
+
Note:
This function is part of the internal implementation details and is
not considered a public API. Its signature and behaviour may change
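
Based on the note above, a hypothetical before-and-after for the cleanup; the traversal itself is collapsed in this diff, so the outcome shown here is assumed rather than verified against the implementation:

```python
payload = {
    "ServicePayload": {"Metrics": {}},  # empty intermediate container
    "tags": [],                         # empty list, preserved per the note
}
# After _strip_empty(payload), the empty dict chain should be dropped while
# the empty list survives, leaving: {"tags": []}
```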
@@ -198,6 +245,24 @@ def _apply_schema_internal(
The function itself does not raise exceptions; it only accumulates
:class:`DataProblem` instances for the caller to handle.

+Performance considerations:
+Time complexity is O(n + m) where n is the number of leaves in the
+compiled schema and m is the number of fields in ``extra_values``.
+The function groups leaves by source field name to avoid redundant
+validation when a single source is used by multiple leaves. For typical
+schemas (< 100 leaves) and typical extra dictionaries (< 50 fields),
+the overhead is negligible (< 1ms). Memory complexity is O(n + m) for
+the output structures.
+
+Limitations:
+- Very large ``extra_values`` dictionaries (> 1000 fields) may cause
+noticeable overhead, but this is uncommon in practice
+- Deeply nested output structures (limited by schema depth) may increase
+memory usage, but the depth is already limited by MAX_SCHEMA_DEPTH
+- All validation errors are collected before returning; for schemas with
+many leaves and many validation failures, the problems list may grow
+large, but this is expected behavior for debugging purposes

Note:
This function is used internally by :class:`SchemaLogger` and is not
considered part of the public API. Its signature and behaviour may
@@ -209,33 +274,51 @@
extra: dict[str, Any] = {}
problems: list[DataProblem] = []

-# Group leaves by source for efficient processing
+# Group leaves by source field name. This is necessary because a single source
+# can be referenced by multiple leaves (allowing the same value to appear in
+# different locations in the output structure). Grouping allows us to process
+# all leaves for a given source together, which is more efficient and allows
+# us to validate the value once per source (e.g., checking for None) rather
+# than once per leaf.
source_to_leaves: dict[str, list[SchemaLeaf]] = defaultdict(list)
for leaf in compiled.leaves:
source_to_leaves[leaf.source].append(leaf)

used_sources = set(source_to_leaves.keys())

+# Process each source that appears in the schema. If a source is missing from
+# extra_values, we silently skip it (this is normal - not all sources need to
+# be present in every log call). We only validate and apply sources that are
+# actually provided.
for source, leaves in source_to_leaves.items():
if source not in extra_values:
+# Source not provided - this is normal, not an error. Skip it.
continue

value = extra_values[source]

-# Check for None values explicitly (None values are not allowed)
-# This check must be done once per source, not once per leaf
+# Check for None values explicitly. None is never allowed for any type,
+# so we check it once per source (not once per leaf) before attempting
+# type-specific validation. This avoids redundant checks when a source
+# is used by multiple leaves.
if value is None:
error_msg = "is None"
problems.append(
DataProblem(
f"Field '{source}' is None",
),
DataProblem(_create_validation_error_json(source, error_msg, None))
)
continue

+# Validate the value against each leaf that references this source.
+# Each leaf validates independently, so a value might pass validation
+# for some leaves (where type matches) but fail for others (where type
+# doesn't match). The value is written only to locations where validation
+# succeeds.
for leaf in leaves:
_validate_and_apply_leaf(leaf, value, source, extra, problems)

-# Report redundant fields for any keys not referenced by schema leaves.
+# Report redundant fields: any keys in extra_values that are not referenced
+# by any schema leaf. These are fields that the user provided but which are
+# not defined in the schema, so they cannot be included in the log output.
+# Optimization: if schema is empty (no used_sources), all fields are redundant,
+# so we can skip the membership check for each key.
redundant_keys = (
@@ -244,10 +327,11 @@
else (key for key in extra_values.keys() if key not in used_sources)
)
for key in redundant_keys:
error_msg = "is not defined in schema"
problems.append(
DataProblem(
f"Field '{key}' is not defined in schema",
),
_create_validation_error_json(key, error_msg, extra_values[key])
)
)

cleaned_extra = _strip_empty(extra)
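
Putting the redundant-field path together, a sketch of what a caller would observe; the schema is assumed empty here so that every provided field is redundant:

```python
problems: list[DataProblem] = []
extra_values = {"unexpected": 42}
used_sources: set[str] = set()  # empty schema: no leaf references any source

for key in extra_values:
    if key not in used_sources:
        problems.append(
            DataProblem(
                _create_validation_error_json(key, "is not defined in schema", extra_values[key])
            )
        )

print(problems[0].message)
# {"field": "'unexpected'", "error": "'is not defined in schema'", "value": "42"}
```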