Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions docs/tutorials/virtual_db_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,7 @@
"cell_type": "markdown",
"id": "cell-3",
"metadata": {},
"source": [
"## Initializing VirtualDB\n",
"\n",
"Creating a VirtualDB instance loads and validates the config but does\n",
"**not** download any data yet. Views are registered lazily on the first\n",
"`query()`, `tables()`, or `describe()` call."
]
"source": "## Initializing VirtualDB\n\nCreating a VirtualDB instance loads and validates the config, downloads any\nnecessary data, and registers all views immediately."
},
{
"cell_type": "code",
Expand Down Expand Up @@ -5100,4 +5094,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
14 changes: 4 additions & 10 deletions docs/virtual_db.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,21 @@ and the [tutorial](tutorials/virtual_db_tutorial.ipynb) for usage examples.

## Advanced Usage

After any public method is called (e.g. `vdb.tables()`), the underlying DuckDB
connection is available as `vdb._db`. You can use `_db` to execute any SQL
on the database, eg creating more views, or creating a table in memory
The underlying DuckDB connection is available as `vdb._conn`. You can use
`_conn` to execute any SQL against the database, e.g., creating additional
views or creating an in-memory table.

Custom **views** created this way appear in `tables()`, `describe()`, and
`get_fields()` automatically because those methods query DuckDB's
`information_schema`. Custom **tables** do not appear in `tables()` (which
only lists views), but are fully queryable via `vdb.query()`.

Call at least one public method first to ensure the connection is initialized
before accessing `_db` directly.

Example -- create a materialized analysis table::

# Trigger view registration
vdb.tables()

# Create a persistent in-memory table from a complex query.
# This example selects one "best" Hackett-2020 sample per regulator
# using a priority system: ZEV+P > GEV+P > GEV+M.
vdb._db.execute("""
vdb._conn.execute("""
CREATE OR REPLACE TABLE hackett_analysis_set AS
WITH regulator_tiers AS (
SELECT
Expand Down
3 changes: 1 addition & 2 deletions docs/virtual_db_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,7 @@ for more detailed explanation of comparative datasets and composite IDs.
## Internal Structure

VirtualDB uses an in-memory DuckDB database to construct a layered hierarchy
of SQL views over locally cached Parquet files. Views are created lazily on
first query and are not persisted to disk.
of SQL views over locally cached Parquet files. Views are created on initialization and are not persisted to disk.

### View Hierarchy

Expand Down
2 changes: 1 addition & 1 deletion tfbpapi/datacard.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def _build_metadata_fields_map(self) -> None:
]
break
else:
self.logger.warning(
self.logger.info(
"No metadata fields found for data config '%s' "
"in repo '%s' -- no embedded metadata_fields and "
"no metadata config with applies_to",
Expand Down
42 changes: 38 additions & 4 deletions tfbpapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

"""

import logging
from enum import Enum
from functools import cached_property
from pathlib import Path
Expand All @@ -29,6 +30,9 @@
FactorAliases: TypeAlias = dict[str, dict[str, list[str | int | float | bool]]]


logger = logging.getLogger(__name__)


class DatasetType(str, Enum):
"""Supported dataset types."""

Expand Down Expand Up @@ -761,6 +765,23 @@ def validate_factor_aliases(cls, v: FactorAliases) -> FactorAliases:
)
return v

@model_validator(mode="after")
def validate_repositories_have_datasets(self) -> "MetadataConfig":
    """
    Ensure each configured repository declares at least one dataset.

    :return: The validated MetadataConfig instance
    :raises ValueError: If any repository has no datasets defined

    """
    # Locate the first repository (in insertion order) whose 'dataset'
    # section is empty or missing; None means every repository is valid.
    offender = next(
        (rid for rid, cfg in self.repositories.items() if not cfg.dataset),
        None,
    )
    if offender is not None:
        raise ValueError(
            f"Repository '{offender}' must define at least one "
            "dataset under the 'dataset' key."
        )
    return self

@model_validator(mode="after")
def validate_unique_db_names(self) -> "MetadataConfig":
"""
Expand Down Expand Up @@ -791,13 +812,19 @@ def validate_unique_db_names(self) -> "MetadataConfig":

@model_validator(mode="before")
@classmethod
def parse_repositories(cls, data: Any) -> dict[str, Any]:
def parse_config(cls, data: Any) -> dict[str, Any]:
"""
Parse repository configurations from 'repositories' key.
Parse and validate all top-level sections of the VirtualDB configuration.

Handles the four top-level sections: ``repositories`` (required),
``factor_aliases``, ``missing_value_labels``, and ``description``
(all optional). Logs an INFO message for each optional section that
is absent from the configuration.

:param data: Raw configuration data
:return: Processed configuration with parsed repositories
:raises ValueError: If repositories are invalid or missing
:return: Processed configuration dict ready for Pydantic field validation
:raises ValueError: If ``repositories`` is missing or empty, or if
any repository config is invalid

"""
if not isinstance(data, dict):
Expand All @@ -811,6 +838,13 @@ def parse_repositories(cls, data: Any) -> dict[str, Any]:
"with at least one repository"
)

for optional_key in ("factor_aliases", "missing_value_labels", "description"):
if not data.get(optional_key):
logger.info(
"No '%s' section found in VirtualDB configuration.",
optional_key,
)

# Parse each repository config
repositories = {}
for repo_id, repo_config in repositories_data.items():
Expand Down
Loading