Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions docs/tutorials/virtual_db_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,7 @@
"cell_type": "markdown",
"id": "cell-3",
"metadata": {},
"source": [
"## Initializing VirtualDB\n",
"\n",
"Creating a VirtualDB instance loads and validates the config but does\n",
"**not** download any data yet. Views are registered lazily on the first\n",
"`query()`, `tables()`, or `describe()` call."
]
"source": "## Initializing VirtualDB\n\nCreating a VirtualDB instance loads and validates the config, downloads any\nnecessary data, and registers all views immediately."
},
{
"cell_type": "code",
Expand Down Expand Up @@ -5100,4 +5094,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
14 changes: 4 additions & 10 deletions docs/virtual_db.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,21 @@ and the [tutorial](tutorials/virtual_db_tutorial.ipynb) for usage examples.

## Advanced Usage

After any public method is called (e.g. `vdb.tables()`), the underlying DuckDB
connection is available as `vdb._db`. You can use `_db` to execute any SQL
on the database, eg creating more views, or creating a table in memory
The underlying DuckDB connection is available as `vdb._conn`. You can use
`_conn` to execute any SQL against the database, e.g., creating additional
views or creating an in-memory table.

Custom **views** created this way appear in `tables()`, `describe()`, and
`get_fields()` automatically because those methods query DuckDB's
`information_schema`. Custom **tables** do not appear in `tables()` (which
only lists views), but are fully queryable via `vdb.query()`.

Call at least one public method first to ensure the connection is initialized
before accessing `_db` directly.

Example -- create a materialized analysis table::

# Trigger view registration
vdb.tables()

# Create a persistent in-memory table from a complex query.
# This example selects one "best" Hackett-2020 sample per regulator
# using a priority system: ZEV+P > GEV+P > GEV+M.
vdb._db.execute("""
vdb._conn.execute("""
CREATE OR REPLACE TABLE hackett_analysis_set AS
WITH regulator_tiers AS (
SELECT
Expand Down
3 changes: 1 addition & 2 deletions docs/virtual_db_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,7 @@ for more detailed explanation of comparative datasets and composite IDs.
## Internal Structure

VirtualDB uses an in-memory DuckDB database to construct a layered hierarchy
of SQL views over locally cached Parquet files. Views are created lazily on
first query and are not persisted to disk.
of SQL views over locally cached Parquet files. Views are created on initialization and are not persisted to disk.

### View Hierarchy

Expand Down
2 changes: 1 addition & 1 deletion tfbpapi/datacard.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def _build_metadata_fields_map(self) -> None:
]
break
else:
self.logger.warning(
self.logger.info(
"No metadata fields found for data config '%s' "
"in repo '%s' -- no embedded metadata_fields and "
"no metadata config with applies_to",
Expand Down
42 changes: 38 additions & 4 deletions tfbpapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

"""

import logging
from enum import Enum
from functools import cached_property
from pathlib import Path
Expand All @@ -29,6 +30,9 @@
FactorAliases: TypeAlias = dict[str, dict[str, list[str | int | float | bool]]]


logger = logging.getLogger(__name__)


class DatasetType(str, Enum):
"""Supported dataset types."""

Expand Down Expand Up @@ -761,6 +765,23 @@ def validate_factor_aliases(cls, v: FactorAliases) -> FactorAliases:
)
return v

@model_validator(mode="after")
def validate_repositories_have_datasets(self) -> "MetadataConfig":
    """
    Ensure each configured repository declares at least one dataset.

    :return: The validated MetadataConfig instance
    :raises ValueError: If any repository has no datasets defined

    """
    # Locate the first repository (in insertion order) whose 'dataset'
    # section is empty or missing; None means every repository is valid.
    offender = next(
        (rid for rid, cfg in self.repositories.items() if not cfg.dataset),
        None,
    )
    if offender is not None:
        raise ValueError(
            f"Repository '{offender}' must define at least one "
            "dataset under the 'dataset' key."
        )
    return self

@model_validator(mode="after")
def validate_unique_db_names(self) -> "MetadataConfig":
"""
Expand Down Expand Up @@ -791,13 +812,19 @@ def validate_unique_db_names(self) -> "MetadataConfig":

@model_validator(mode="before")
@classmethod
def parse_repositories(cls, data: Any) -> dict[str, Any]:
def parse_config(cls, data: Any) -> dict[str, Any]:
"""
Parse repository configurations from 'repositories' key.
Parse and validate all top-level sections of the VirtualDB configuration.

Handles the four top-level sections: ``repositories`` (required),
``factor_aliases``, ``missing_value_labels``, and ``description``
(all optional). Logs an INFO message for each optional section that
is absent from the configuration.

:param data: Raw configuration data
:return: Processed configuration with parsed repositories
:raises ValueError: If repositories are invalid or missing
:return: Processed configuration dict ready for Pydantic field validation
:raises ValueError: If ``repositories`` is missing or empty, or if
any repository config is invalid

"""
if not isinstance(data, dict):
Expand All @@ -811,6 +838,13 @@ def parse_repositories(cls, data: Any) -> dict[str, Any]:
"with at least one repository"
)

for optional_key in ("factor_aliases", "missing_value_labels", "description"):
if not data.get(optional_key):
logger.info(
"No '%s' section found in VirtualDB configuration.",
optional_key,
)

# Parse each repository config
repositories = {}
for repo_id, repo_config in repositories_data.items():
Expand Down
Loading