From a278c40596e4d87dc6cca92a835208bef476e82b Mon Sep 17 00:00:00 2001 From: chasem Date: Wed, 11 Mar 2026 12:06:40 -0500 Subject: [PATCH 1/4] adding dtype factor and updating local branch of dev --- docs/virtual_db_configuration.md | 66 ++++- tfbpapi/models.py | 16 +- tfbpapi/tests/test_virtual_db.py | 257 ++++++++++++++++++- tfbpapi/virtual_db.py | 421 ++++++++++++++++++++++++++++++- 4 files changed, 745 insertions(+), 15 deletions(-) diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md index 06897c3..90e58d0 100644 --- a/docs/virtual_db_configuration.md +++ b/docs/virtual_db_configuration.md @@ -139,13 +139,77 @@ during metadata extraction and query filtering. - `string` - Text data (default if not specified) - `numeric` - Numeric values (integers or floating-point numbers) - `bool` - Boolean values (true/false) +- `factor` - Categorical data backed by a DuckDB ENUM type (see below) **When to use dtype**: 1. **Numeric filtering**: Required for fields used with comparison operators (`<`, `>`, `<=`, `>=`, `between`) 2. **Type consistency**: When source data might be extracted with incorrect type -3. **Performance**: Helps with query optimization and prevents type mismatches +3. **Categorical columns**: Use `factor` when a field has a fixed, known set of + levels and you want DuckDB to enforce membership and enable efficient storage + +### factor dtype (DuckDB ENUM) + +When `dtype: factor` is set on a field-only mapping, VirtualDB registers a DuckDB +ENUM type from the field's `class_label` definition in the DataCard and casts the +column to that type in the `_meta` view. + +**Requirements**: + +- `dtype: factor` may only be used with field-only mappings (`field:` specified, + no `path:` or `expression:`). +- The DataCard must declare the field with `dtype: {class_label: {names: [...]}}`. + If the field is missing, has a non-`class_label` dtype, or the `names` list is + absent or empty, VirtualDB raises a `ValueError` at view-registration time. + +**Column naming when the output name matches the source field**: + +When the mapping key equals the source field name (the common case, e.g. +`time: {field: time, dtype: factor}`), the raw column is preserved in the view +under a `_orig` alias so that the original values remain accessible: + +- `time` -- ENUM-typed column with levels from the DataCard +- `time_orig` -- original raw column (e.g., DOUBLE or VARCHAR) + +If `time_orig` already exists in the parquet, VirtualDB finds the next available +name: `time_orig_1`, `time_orig_2`, etc. 
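+
+A minimal sketch of the SQL this produces for an in-place factor override,
+using the `time`/`hackett` example shown below (illustrative only; the view
+and type names follow the `{db_name}_meta` and `_enum_{key}` conventions used
+in this document, and the exact statement is assembled internally at
+view-registration time):
+
+```sql
+-- ENUM type registered from the DataCard's class_label names
+CREATE TYPE _enum_time AS ENUM ('0', '5', '10', '15', '20', '45', '90');
+
+CREATE OR REPLACE VIEW hackett_meta AS
+SELECT
+    sample_id,
+    time AS time_orig,                     -- raw column preserved under _orig
+    CAST(time_orig AS _enum_time) AS time  -- DuckDB allows reusing the alias
+FROM __hackett_parquet;
+```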
+ +**Example DataCard feature definition** (in the HuggingFace dataset card YAML): + +```yaml +- name: time + dtype: + class_label: + names: + - 0 + - 5 + - 10 + - 15 + - 20 + - 45 + - 90 + description: Time point in minutes after induction +``` + +**Example VirtualDB config**: + +```yaml +repositories: + BrentLab/hackett_2020: + dataset: + hackett_2020: + db_name: hackett + sample_id: + field: sample_id + time: + field: time + dtype: factor +``` + +After view registration, `hackett_meta` will contain: +- `time` -- ENUM column, queryable as `WHERE time = '45'` +- `time_orig` -- original numeric column ## Tags diff --git a/tfbpapi/models.py b/tfbpapi/models.py index c2c9501..ae0d918 100644 --- a/tfbpapi/models.py +++ b/tfbpapi/models.py @@ -364,7 +364,13 @@ class PropertyMapping(BaseModel): None, description="SQL expression for derived fields" ) dtype: str | None = Field( - None, description="Data type for conversion: 'string', 'numeric', or 'bool'" + None, + description=( + "Data type for conversion: 'string', 'numeric', 'bool', or 'factor'. " + "When 'factor', the field must reference a DataCard field with a " + "class_label dtype specifying the allowed levels. VirtualDB will " + "register a DuckDB ENUM type and cast the column to it." + ), ) @field_validator("path", "field", "expression", mode="before") @@ -390,6 +396,8 @@ def validate_field_types(self) -> "PropertyMapping": """ Ensure at least one field type is specified and mutually exclusive. + Also validates dtype='factor' requires a field (not expression or path-only). + :return: The validated PropertyMapping instance :raises ValueError: If validation constraints are violated @@ -404,6 +412,12 @@ def validate_field_types(self) -> "PropertyMapping": raise ValueError( "At least one of 'field', 'path', or 'expression' must be specified" ) + if self.dtype == "factor": + if self.expression is not None or self.field is None: + raise ValueError( + "dtype='factor' requires 'field' to be specified and " + "cannot be used with 'expression' or as a path-only mapping" + ) return self diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py index 6129880..3d32e52 100644 --- a/tfbpapi/tests/test_virtual_db.py +++ b/tfbpapi/tests/test_virtual_db.py @@ -15,7 +15,7 @@ import yaml # type: ignore from tfbpapi.datacard import DatasetSchema -from tfbpapi.models import DatasetType, MetadataConfig +from tfbpapi.models import DatasetType, FeatureInfo, MetadataConfig from tfbpapi.virtual_db import VirtualDB # ------------------------------------------------------------------ @@ -1469,3 +1469,258 @@ def test_external_metadata_join(self, tmp_path, monkeypatch): raw_result = v.query("SELECT * FROM chip ORDER BY sample_id") assert "db_id" in raw_result.columns assert len(raw_result) == 4 # 4 data rows + + +# ------------------------------------------------------------------ +# Tests: dtype='factor' (DuckDB ENUM) +# ------------------------------------------------------------------ + + +class TestFactorDtype: + """Tests for PropertyMapping dtype='factor' and DuckDB ENUM columns.""" + + def _make_vdb_with_factor(self, tmp_path, monkeypatch, feature_dtype): + """ + Helper: build a VirtualDB with one dataset whose 'category' field + has a PropertyMapping with dtype='factor'. ``feature_dtype`` is + passed as the FeatureInfo.dtype for the 'category' field in the + mock DataCard. 
+ """ + import tfbpapi.virtual_db as vdb_module + + df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2], + "category": ["A", "B", "A", "C"], + "value": [1.0, 2.0, 3.0, 4.0], + } + ) + parquet_path = tmp_path / "data.parquet" + files = {("TestOrg/ds", "cfg"): [_write_parquet(parquet_path, df)]} + + config = { + "repositories": { + "TestOrg/ds": { + "dataset": { + "cfg": { + "db_name": "ds", + "sample_id": {"field": "sample_id"}, + "category": { + "field": "category", + "dtype": "factor", + }, + } + } + } + } + } + config_file = tmp_path / "config.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + card = MagicMock() + card.get_metadata_fields.return_value = ["sample_id", "category"] + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + card.get_metadata_config_name.return_value = None + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"sample_id", "category", "value"}, + metadata_columns={"sample_id", "category"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + feature_list = [ + FeatureInfo( + name="category", + dtype=feature_dtype, + description="A categorical field", + ), + FeatureInfo( + name="sample_id", + dtype="int64", + description="Sample identifier", + ), + ] + card.get_features.return_value = feature_list + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + return VirtualDB(config_file) + + def test_factor_dtype_creates_enum_column(self, tmp_path, monkeypatch): + """Dtype='factor' casts the column to a DuckDB ENUM in the _meta view.""" + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + df = v.query("SELECT * FROM ds_meta ORDER BY sample_id") + assert "category" in df.columns + # Values should be preserved + assert set(df["category"].dropna()) == {"A", "B", "C"} + + def test_factor_dtype_enum_type_registered(self, tmp_path, monkeypatch): + """The DuckDB ENUM type is registered and can be queried.""" + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + # Trigger view registration + v.tables() + # The ENUM type should be registered in DuckDB + types_df = v._conn.execute( + "SELECT type_name FROM duckdb_types() WHERE logical_type = 'ENUM'" + ).fetchdf() + assert "_enum_category" in types_df["type_name"].tolist() + + def test_factor_dtype_missing_class_label_raises(self, tmp_path, monkeypatch): + """ValueError is raised when the DataCard field has no class_label dtype.""" + with pytest.raises(ValueError, match="class_label"): + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype="string", # not a class_label dict + ) + v.tables() # triggers view registration + + def test_factor_dtype_no_names_raises(self, tmp_path, monkeypatch): + """ValueError is raised when class_label has no 'names' key.""" + with pytest.raises(ValueError, match="names"): + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {}}, # no names + ) + v.tables() + + def test_factor_dtype_model_validator_requires_field(self): + """PropertyMapping with dtype='factor' and no field raises ValidationError.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError, 
match="factor"): + from tfbpapi.models import PropertyMapping + + PropertyMapping.model_validate({"path": "some.path", "dtype": "factor"}) + + def test_factor_dtype_model_validator_rejects_expression(self): + """PropertyMapping with dtype='factor' and expression raises ValidationError.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError): + from tfbpapi.models import PropertyMapping + + PropertyMapping.model_validate({"expression": "col > 0", "dtype": "factor"}) + + def test_factor_dtype_inplace_renames_raw_to_orig(self, tmp_path, monkeypatch): + """ + When dtype='factor' maps a field to the same output name (e.g. + category: {field: category, dtype: factor}), the raw column is + renamed to _orig in the _meta view, and the ENUM-cast column + keeps the original name. + """ + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + df = v.query("SELECT * FROM ds_meta ORDER BY sample_id") + # ENUM-cast column keeps the original name + assert "category" in df.columns + # Raw numeric/string original is preserved under _orig alias + assert "category_orig" in df.columns + # The _orig column should hold the raw values + assert set(df["category_orig"].dropna()) == {"A", "B", "C"} + + def test_factor_dtype_orig_suffix_avoids_collision(self, tmp_path, monkeypatch): + """When _orig already exists in the parquet, the rename uses _orig_1 + instead.""" + import tfbpapi.virtual_db as vdb_module + + df = pd.DataFrame( + { + "sample_id": [1, 2], + "category": ["A", "B"], + "category_orig": ["x", "y"], # pre-existing _orig column + "value": [1.0, 2.0], + } + ) + parquet_path = tmp_path / "data2.parquet" + files = {("TestOrg/ds2", "cfg2"): [_write_parquet(parquet_path, df)]} + + config = { + "repositories": { + "TestOrg/ds2": { + "dataset": { + "cfg2": { + "db_name": "ds2", + "sample_id": {"field": "sample_id"}, + "category": { + "field": "category", + "dtype": "factor", + }, + } + } + } + } + } + config_file = tmp_path / "config2.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + card = MagicMock() + card.get_metadata_fields.return_value = [ + "sample_id", + "category", + "category_orig", + ] + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + card.get_metadata_config_name.return_value = None + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"sample_id", "category", "category_orig", "value"}, + metadata_columns={"sample_id", "category", "category_orig"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + card.get_features.return_value = [ + FeatureInfo( + name="category", + dtype={"class_label": {"names": ["A", "B"]}}, + description="categorical", + ), + FeatureInfo( + name="sample_id", + dtype="int64", + description="id", + ), + ] + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + v = VirtualDB(config_file) + + result = v.query("SELECT * FROM ds2_meta ORDER BY sample_id") + # Should use _orig_1 because _orig is taken + assert "category_orig_1" in result.columns + assert "category" in result.columns diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index eaa77d3..108304d 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -8,6 +8,8 @@ use, a set of common fields, 
datasets that contain comparative analytics, and more. VirtualDB, this code, then uses DuckDB to construct views over Parquet files cached locally on initialization. For primary datasets, VirtualDB creates metadata +VirtualDB, this code, then uses DuckDB to construct views over Parquet files cached +locally on initialization. For primary datasets, VirtualDB creates metadata views (one row per sample with derived columns) and full data views (measurement-level data joined to metadata). For comparative analysis datasets, VirtualDB creates expanded views that parse composite ID fields into ``_source`` (aliased to the configured @@ -53,6 +55,7 @@ from tfbpapi.datacard import DataCard, DatasetSchema from tfbpapi.models import DatasetType, MetadataConfig +from tfbpapi.models import DatasetType, MetadataConfig logger = logging.getLogger(__name__) @@ -175,19 +178,29 @@ def __init__( Creates the DuckDB connection and registers all views immediately. + Creates the DuckDB connection and registers all views immediately. + :param config_path: Path to YAML configuration file :param token: Optional HuggingFace token for private datasets :param duckdb_connection: Optional DuckDB connection. If provided, views will be registered on this connection instead of creating a new in-memory database. This provides a method of using a persistent database file. If not provided, an in-memory DuckDB connection is created. + This provides a method of using a persistent database file. If not provided, + an in-memory DuckDB connection is created. :raises FileNotFoundError: If config file does not exist :raises ValueError: If configuration is invalid + :raises ValueError: If configuration is invalid """ self.config = MetadataConfig.from_yaml(config_path) self.token = token + self._conn: duckdb.DuckDBPyConnection = ( + duckdb_connection + if duckdb_connection is not None + else duckdb.connect(":memory:") + ) self._conn: duckdb.DuckDBPyConnection = ( duckdb_connection if duckdb_connection is not None @@ -204,6 +217,10 @@ def __init__( self._validate_datacards() self._update_cache() self._register_all_views() + self._load_datacards() + self._validate_datacards() + self._update_cache() + self._register_all_views() # ------------------------------------------------------------------ # Public API @@ -245,6 +262,8 @@ def query(self, sql: str, **params: Any) -> pd.DataFrame: if params: return self._conn.execute(resolved, params).fetchdf() return self._conn.execute(resolved).fetchdf() + return self._conn.execute(resolved, params).fetchdf() + return self._conn.execute(resolved).fetchdf() def prepare(self, name: str, sql: str, overwrite: bool = False) -> None: """ @@ -271,6 +290,7 @@ def prepare(self, name: str, sql: str, overwrite: bool = False) -> None: """ + if name in self._list_views() and not overwrite: error_msg = ( f"Prepared-query name '{name}' collides with " @@ -289,6 +309,7 @@ def tables(self) -> list[str]: """ + return sorted(self._list_views()) def describe(self, table: str | None = None) -> pd.DataFrame: @@ -301,13 +322,16 @@ def describe(self, table: str | None = None) -> pd.DataFrame: """ + if table is not None: + df = self._conn.execute(f"DESCRIBE {table}").fetchdf() df = self._conn.execute(f"DESCRIBE {table}").fetchdf() df.insert(0, "table", table) return df frames = [] for view in sorted(self._list_views()): + df = self._conn.execute(f"DESCRIBE {view}").fetchdf() df = self._conn.execute(f"DESCRIBE {view}").fetchdf() df.insert(0, "table", view) frames.append(df) @@ -324,7 +348,9 @@ def get_fields(self, table: str 
| None = None) -> list[str]: """ + if table is not None: + cols = self._conn.execute( cols = self._conn.execute( f"SELECT column_name FROM information_schema.columns " f"WHERE table_name = '{table}'" @@ -333,6 +359,7 @@ def get_fields(self, table: str | None = None) -> list[str]: all_cols: set[str] = set() for view in self._list_views(): + cols = self._conn.execute( cols = self._conn.execute( f"SELECT column_name FROM information_schema.columns " f"WHERE table_name = '{view}'" @@ -351,12 +378,14 @@ def get_common_fields(self) -> list[str]: """ + meta_views = self._get_primary_meta_view_names() if not meta_views: return [] sets = [] for view in meta_views: + cols = self._conn.execute( cols = self._conn.execute( f"SELECT column_name FROM information_schema.columns " f"WHERE table_name = '{view}'" @@ -407,6 +436,7 @@ def get_tags(self, db_name: str) -> dict[str, str]: # ------------------------------------------------------------------ # Initialisation phases + # Initialisation phases # ------------------------------------------------------------------ def _load_datacards(self) -> None: @@ -433,6 +463,44 @@ def _load_datacards(self) -> None: exc, ) + def _validate_datacards(self) -> None: + """ + Cross-check the VirtualDB config against the loaded datacards. + + Checks that every dataset with a ``links`` field in the VirtualDB + config has ``dataset_type: comparative`` in its HuggingFace datacard. + Also resolves ``self._dataset_schemas`` and + ``self._external_meta_configs`` (keyed by ``db_name``) for use by + ``_update_cache`` and ``_register_all_views``. + + :raises ValueError: If a dataset with ``links`` does not have + ``dataset_type: comparative`` in its datacard. + + """ + def _load_datacards(self) -> None: + """ + Fetch (or load from cache) the DataCard for every distinct repo. + + Populates ``self._datacards`` keyed by ``repo_id``. Failures are + logged as warnings and the repo is omitted from the dict so that + subsequent phases can skip it gracefully. + + """ + self._datacards: dict[str, DataCard] = {} + seen_repos: set[str] = set() + for repo_id, _ in self._db_name_map.values(): + if repo_id in seen_repos: + continue + seen_repos.add(repo_id) + try: + self._datacards[repo_id] = _cached_datacard(repo_id, token=self.token) + except Exception as exc: + logger.warning( + "Could not load datacard for repo '%s': %s", + repo_id, + exc, + ) + def _validate_datacards(self) -> None: """ Cross-check the VirtualDB config against the loaded datacards. @@ -451,6 +519,9 @@ def _validate_datacards(self) -> None: # db_name -> external metadata config_name (for applies_to datasets) self._external_meta_configs: dict[str, str] = {} + # db_name -> external metadata config_name (for applies_to datasets) + self._external_meta_configs: dict[str, str] = {} + for db_name, (repo_id, config_name) in self._db_name_map.items(): repo_cfg = self.config.repositories.get(repo_id) ds_cfg = ( @@ -460,6 +531,34 @@ def _validate_datacards(self) -> None: ) card = self._datacards.get(repo_id) + # Validate comparative dataset_type agreement. + if ds_cfg and ds_cfg.links: + if card is not None: + dc_config = card.get_config(config_name) + if ( + dc_config is not None + and dc_config.dataset_type != DatasetType.COMPARATIVE + ): + raise ValueError( + f"Dataset '{config_name}' in repo '{repo_id}' has " + f"'links' in the VirtualDB config, indicating a " + f"comparative dataset, but the HuggingFace datacard " + f"declares dataset_type='{dc_config.dataset_type}'. " + f"Update the datacard to use dataset_type: comparative." 
+ ) + continue # comparative datasets need no schema resolution + + # Resolve dataset schema and external metadata config for + # primary datasets. + if card is None: + repo_cfg = self.config.repositories.get(repo_id) + ds_cfg = ( + repo_cfg.dataset.get(config_name) + if repo_cfg and repo_cfg.dataset + else None + ) + card = self._datacards.get(repo_id) + # Validate comparative dataset_type agreement. if ds_cfg and ds_cfg.links: if card is not None: @@ -497,6 +596,9 @@ def _validate_datacards(self) -> None: schema is not None and schema.metadata_source == "external" and schema.external_metadata_config + schema is not None + and schema.metadata_source == "external" + and schema.external_metadata_config ): self._external_meta_configs[db_name] = schema.external_metadata_config @@ -521,6 +623,54 @@ def _update_cache(self) -> None: files = self._resolve_parquet_files(repo_id, ext_config_name) self._parquet_files[f"__{db_name}_meta"] = files + def _register_all_views(self) -> None: + """ + Register all DuckDB views in dependency order. + + Expects ``self._parquet_files``, ``self._dataset_schemas``, and + ``self._external_meta_configs`` to have been populated by the earlier + init phases. No network or disk access occurs here. + + """ + # 1. Raw per-dataset views (internal ___parquet + # plus public for primary datasets only) + for db_name, (repo_id, config_name) in self._db_name_map.items(): + comparative = self._is_comparative(repo_id, config_name) + self._register_raw_view( + db_name, + parquet_only=comparative, + ) + + # 2. External metadata parquet views. + # When a data config's metadata lives in a separate HF config + # (applies_to), register its parquet as ___metadata_parquet. + self._external_meta_views: dict[str, str] = {} + for db_name, ext_config_name in self._external_meta_configs.items(): + meta_view = f"__{db_name}_metadata_parquet" + files = self._parquet_files.get(f"__{db_name}_meta", []) + self._external_meta_configs[db_name] = schema.external_metadata_config + + def _update_cache(self) -> None: + """ + Download (or locate cached) Parquet files for all dataset configs. + + Populates ``self._parquet_files`` keyed by ``db_name``. For datasets + with external metadata (identified during ``_validate_datacards``), + also downloads those files and stores them under the key + ``"___meta"`` so ``_register_all_views`` can read them + without further network calls. + + """ + self._parquet_files: dict[str, list[str]] = {} + for db_name, (repo_id, config_name) in self._db_name_map.items(): + files = self._resolve_parquet_files(repo_id, config_name) + self._parquet_files[db_name] = files + + for db_name, ext_config_name in self._external_meta_configs.items(): + repo_id, _ = self._db_name_map[db_name] + files = self._resolve_parquet_files(repo_id, ext_config_name) + self._parquet_files[f"__{db_name}_meta"] = files + def _register_all_views(self) -> None: """ Register all DuckDB views in dependency order. @@ -552,10 +702,14 @@ def _register_all_views(self) -> None: "'%s' (db_name '%s') -- skipping external metadata view", ext_config_name, db_name, + "'%s' (db_name '%s') -- skipping external metadata view", + ext_config_name, + db_name, ) continue files_sql = ", ".join(f"'{f}'" for f in files) try: + self._conn.execute( self._conn.execute( f"CREATE OR REPLACE VIEW {meta_view} AS " f"SELECT * FROM read_parquet([{files_sql}])" @@ -569,17 +723,20 @@ def _register_all_views(self) -> None: continue self._external_meta_views[db_name] = meta_view + # 3. Metadata views for primary datasets (_meta) # 3. 
Metadata views for primary datasets (_meta) for db_name, (repo_id, config_name) in self._db_name_map.items(): if not self._is_comparative(repo_id, config_name): self._register_meta_view(db_name, repo_id, config_name) + # 4. Replace primary raw views with join to _meta so # 4. Replace primary raw views with join to _meta so # derived columns (e.g. carbon_source) are available for db_name, (repo_id, config_name) in self._db_name_map.items(): if not self._is_comparative(repo_id, config_name): self._enrich_raw_view(db_name) + # 5. Comparative expanded views (pre-parsed composite IDs) # 5. Comparative expanded views (pre-parsed composite IDs) for db_name, (repo_id, config_name) in self._db_name_map.items(): repo_cfg = self.config.repositories.get(repo_id) @@ -675,6 +832,7 @@ def _register_raw_view( ) -> None: """ Register a raw DuckDB view over pre-resolved Parquet files. + Register a raw DuckDB view over pre-resolved Parquet files. Creates an internal ``___parquet`` view that reads directly from the Parquet files. For primary datasets, also @@ -687,14 +845,19 @@ def _register_raw_view( Parquet files must have been resolved by ``_update_cache`` before this method is called. + Parquet files must have been resolved by ``_update_cache`` + before this method is called. + :param db_name: View name :param parquet_only: If True, only create the internal ``___parquet`` view (no public ````). """ files = self._parquet_files.get(db_name, []) + files = self._parquet_files.get(db_name, []) if not files: logger.warning( + "No parquet files for db_name '%s' -- skipping view", "No parquet files for db_name '%s' -- skipping view", db_name, ) @@ -702,6 +865,7 @@ def _register_raw_view( files_sql = ", ".join(f"'{f}'" for f in files) parquet_sql = f"SELECT * FROM read_parquet([{files_sql}])" + self._conn.execute( self._conn.execute( f"CREATE OR REPLACE VIEW __{db_name}_parquet AS " f"{parquet_sql}" ) @@ -722,6 +886,22 @@ def _register_raw_view( cols_sql = ", ".join(parts) public_select = f"SELECT {cols_sql} FROM __{db_name}_parquet" self._conn.execute(f"CREATE OR REPLACE VIEW {db_name} AS {public_select}") + sample_col = self._get_sample_id_col(db_name) + if sample_col == "sample_id": + public_select = f"SELECT * FROM __{db_name}_parquet" + else: + raw_cols = self._get_view_columns(f"__{db_name}_parquet") + parts: list[str] = [] + for col in raw_cols: + if col == sample_col: + parts.append(f"{col} AS sample_id") + elif col == "sample_id": + parts.append(f"{col} AS sample_id_orig") + else: + parts.append(col) + cols_sql = ", ".join(parts) + public_select = f"SELECT {cols_sql} FROM __{db_name}_parquet" + self._conn.execute(f"CREATE OR REPLACE VIEW {db_name} AS {public_select}") def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> None: """ @@ -757,6 +937,8 @@ def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> N # FROM clause construction. schema: DatasetSchema | None = self._dataset_schemas.get(db_name) ext_meta_view: str | None = self._external_meta_views.get(db_name) + schema: DatasetSchema | None = self._dataset_schemas.get(db_name) + ext_meta_view: str | None = self._external_meta_views.get(db_name) is_external = ( ext_meta_view is not None @@ -815,7 +997,49 @@ def qualify(col: str) -> str: return f"m.{col}" return f"d.{col}" + # Resolve derived property expressions first. + # When a factor mapping has the same output name as its source + # field (e.g. time -> time), the raw column must be renamed to + # avoid a duplicate column name in the SELECT. 
The rename uses + # "_orig", or "_orig_1", etc., to avoid collisions with + # other columns that already exist in the parquet. + prop_result = self._resolve_property_columns(repo_id, config_name) + + # Collect all column names that exist in the parquet so we can + # find a unique _orig suffix when needed. + all_parquet_cols: set[str] = set(self._get_view_columns(parquet_view)) + + # Map: raw_col -> alias_in_select for factor-overridden cols + factor_col_renames: dict[str, str] = {} + if prop_result is not None: + _derived_exprs, _prop_raw_cols = prop_result + for expr in _derived_exprs: + # Detect CAST( AS _enum_) AS patterns + # where == (in-place factor override) + if not expr.startswith("CAST("): + continue + parts = expr.rsplit(" AS ", 1) + if len(parts) != 2: + continue + out_col = parts[1].strip() + # Check whether the source field has the same name as + # the output column (in-place override case) + cast_inner = parts[0][len("CAST(") :] + src_field = cast_inner.split(" AS ")[0].strip() + if src_field == out_col and out_col in all_parquet_cols: + # Find a unique _orig name + candidate = f"{out_col}_orig" + n = 1 + while candidate in all_parquet_cols or candidate in ( + v for v in factor_col_renames.values() + ): + candidate = f"{out_col}_orig_{n}" + n += 1 + factor_col_renames[src_field] = candidate + # Build SELECT: sample_id + metadata cols (deduplicated). + # Raw columns that are factor-overridden are emitted with their + # _orig alias instead of their original name. # If the configured sample_id column differs from "sample_id", # rename it so all views expose a consistent "sample_id" column. # If the parquet also has a literal "sample_id" column, preserve @@ -823,20 +1047,31 @@ def qualify(col: str) -> str: seen: set[str] = set() select_parts: list[str] = [] rename_sample = sample_col != "sample_id" + rename_sample = sample_col != "sample_id" def add_col(col: str) -> None: - if col not in seen: - seen.add(col) - if rename_sample and col == sample_col: - select_parts.append(f"{qualify(col)} AS sample_id") - elif rename_sample and col == "sample_id": - select_parts.append(f"{qualify(col)} AS sample_id_orig") - else: - select_parts.append(qualify(col)) + if col in seen: + return + seen.add(col) + alias = factor_col_renames.get(col) + if alias: + select_parts.append(f"{qualify(col)} AS {alias}") + elif rename_sample and col == sample_col: + select_parts.append(f"{qualify(col)} AS sample_id") + elif rename_sample and col == "sample_id": + select_parts.append(f"{qualify(col)} AS sample_id_orig") + else: + select_parts.append(qualify(col)) add_col(sample_col) # When renaming, check if the parquet source also has a literal # "sample_id" column; if so, preserve it as "sample_id_orig". + if rename_sample: + source_cols = set(self._get_view_columns(parquet_view)) + if "sample_id" in source_cols: + add_col("sample_id") + # When renaming, check if the parquet source also has a literal + # "sample_id" column; if so, preserve it as "sample_id_orig". 
if rename_sample: source_cols = set(self._get_view_columns(parquet_view)) if "sample_id" in source_cols: @@ -845,12 +1080,21 @@ def add_col(col: str) -> None: add_col(col) # Add derived property expressions from the VirtualDB config - prop_result = self._resolve_property_columns(repo_id, config_name) if prop_result is not None: derived_exprs, prop_raw_cols = prop_result # Ensure source columns needed by expressions are selected for col in prop_raw_cols: add_col(col) + # Rewrite CAST expressions to use the _orig alias when the + # source field was renamed to avoid collision. + if factor_col_renames: + rewritten = [] + for expr in derived_exprs: + for orig, alias in factor_col_renames.items(): + # Replace "CAST( AS" with "CAST( AS" + expr = expr.replace(f"CAST({orig} AS", f"CAST({alias} AS") + rewritten.append(expr) + derived_exprs = rewritten # Qualify source column references inside CASE WHEN expressions if is_join: qualified_exprs = [] @@ -873,6 +1117,7 @@ def add_col(col: str) -> None: ) try: self._conn.execute(sql) + self._conn.execute(sql) except BinderException as exc: raise BinderException( f"Failed to create meta view '{db_name}_meta'.\n" @@ -899,6 +1144,8 @@ def _enrich_raw_view(self, db_name: str) -> None: if not self._view_exists(meta_name) or not self._view_exists(parquet_name): return + raw_cols_list = self._get_view_columns(parquet_name) + raw_cols = set(raw_cols_list) raw_cols_list = self._get_view_columns(parquet_name) raw_cols = set(raw_cols_list) meta_cols = set(self._get_view_columns(meta_name)) @@ -906,6 +1153,22 @@ def _enrich_raw_view(self, db_name: str) -> None: sample_col = self._get_sample_id_col(db_name) rename_sample = sample_col != "sample_id" + # Columns to pull from _meta that aren't already in raw parquet, + # accounting for the sample_id rename: when renaming, "sample_id" + # will appear in meta_cols (as the renamed column) but not in + # raw_cols (which has the original name), so we must exclude it + # from extra_cols since the rename in the raw SELECT already + # provides it. + if rename_sample: + # "sample_id" and "sample_id_orig" come from the raw SELECT + # rename, not from meta + extra_cols = meta_cols - raw_cols - {"sample_id", "sample_id_orig"} + else: + extra_cols = meta_cols - raw_cols + + sample_col = self._get_sample_id_col(db_name) + rename_sample = sample_col != "sample_id" + # Columns to pull from _meta that aren't already in raw parquet, # accounting for the sample_id rename: when renaming, "sample_id" # will appear in meta_cols (as the renamed column) but not in @@ -920,6 +1183,9 @@ def _enrich_raw_view(self, db_name: str) -> None: extra_cols = meta_cols - raw_cols if not extra_cols: + # No derived columns to add -- the view created in + # _register_raw_view (which already handles the rename) + # is sufficient. # No derived columns to add -- the view created in # _register_raw_view (which already handles the rename) # is sufficient. 
@@ -950,11 +1216,39 @@ def _enrich_raw_view(self, db_name: str) -> None: else: join_clause = f"JOIN {meta_name} m USING ({sample_col})" + self._conn.execute( + if rename_sample: + # Build explicit SELECT to rename the sample column + raw_parts: list[str] = [] + for col in raw_cols_list: + if col == sample_col: + raw_parts.append(f"r.{col} AS sample_id") + elif col == "sample_id": + raw_parts.append(f"r.{col} AS sample_id_orig") + else: + raw_parts.append(f"r.{col}") + raw_select = ", ".join(raw_parts) + else: + raw_select = "r.*" + + if extra_cols: + extra_select = ", ".join(f"m.{c}" for c in sorted(extra_cols)) + full_select = f"{raw_select}, {extra_select}" + else: + full_select = raw_select + + if rename_sample: + join_clause = f"JOIN {meta_name} m ON r.{sample_col} = m.sample_id" + else: + join_clause = f"JOIN {meta_name} m USING ({sample_col})" + self._conn.execute( f"CREATE OR REPLACE VIEW {db_name} AS " f"SELECT {full_select} " + f"SELECT {full_select} " f"FROM {parquet_name} r " f"{join_clause}" + f"{join_clause}" ) def _get_view_columns(self, view: str) -> list[str]: @@ -967,6 +1261,7 @@ def _get_view_columns(self, view: str) -> list[str]: """ df = self._conn.execute(f"DESCRIBE {view}").fetchdf() + df = self._conn.execute(f"DESCRIBE {view}").fetchdf() return df["column_name"].tolist() def _get_sample_id_col(self, db_name: str) -> str: @@ -996,6 +1291,9 @@ def _resolve_metadata_fields( """ try: + card = self._datacards.get(repo_id) or _cached_datacard( + repo_id, token=self.token + ) card = self._datacards.get(repo_id) or _cached_datacard( repo_id, token=self.token ) @@ -1008,6 +1306,76 @@ def _resolve_metadata_fields( ) return None + def _get_class_label_names( + self, card: Any, config_name: str, field: str + ) -> list[str]: + """ + Return the ENUM levels for a field with class_label dtype. + + Looks up the FeatureInfo for ``field`` in the DataCard config and + extracts the ``names`` list from its ``class_label`` dtype dict. + + :param card: DataCard instance + :param config_name: Configuration name + :param field: Field name to look up + :return: List of level strings + :raises ValueError: If the field is not found, has no class_label dtype, + or the class_label dict has no ``names`` key + + """ + try: + features = card.get_features(config_name) + except Exception as exc: + raise ValueError( + f"Could not retrieve features for config '{config_name}': {exc}" + ) from exc + + feature = next((f for f in features if f.name == field), None) + if feature is None: + raise ValueError( + f"Field '{field}' not found in DataCard config '{config_name}'. " + "dtype='factor' requires the field to be declared in the DataCard." + ) + + dtype = feature.dtype + if not isinstance(dtype, dict) or "class_label" not in dtype: + raise ValueError( + f"dtype='factor' is set for field '{field}' in config " + f"'{config_name}', but the DataCard dtype is {dtype!r} rather " + "than a class_label dict. " + "The DataCard must declare dtype: {class_label: {names: [...]}}." + ) + + class_label = dtype["class_label"] + names = class_label.get("names") if isinstance(class_label, dict) else None + if not names: + raise ValueError( + f"class_label for field '{field}' in config '{config_name}' " + "has no 'names' key or the names list is empty. " + "Specify levels as: class_label: {names: [level1, level2, ...]}." + ) + + return [str(n) for n in names] + + def _ensure_enum_type(self, type_name: str, levels: list[str]) -> None: + """ + Create or replace a DuckDB ENUM type with the given levels. 
+ + DuckDB ENUM types must be registered before use in CAST expressions. Drops any + existing type with the same name first to allow re-registration on repeated view + builds. + + :param type_name: SQL identifier for the ENUM type + :param levels: Ordered list of allowed string values + + """ + try: + self._conn.execute(f"DROP TYPE IF EXISTS {type_name}") + except Exception: + pass # type may not exist yet + escaped = ", ".join(f"'{v.replace(chr(39), chr(39)*2)}'" for v in levels) + self._conn.execute(f"CREATE TYPE {type_name} AS ENUM ({escaped})") + def _resolve_alias(self, col: str, value: str) -> str: """ Apply factor alias to a value if one is configured. @@ -1060,6 +1428,9 @@ def _resolve_property_columns( card = self._datacards.get(repo_id) or _cached_datacard( repo_id, token=self.token ) + card = self._datacards.get(repo_id) or _cached_datacard( + repo_id, token=self.token + ) except Exception as exc: logger.warning( "Could not load DataCard for %s: %s", @@ -1075,9 +1446,19 @@ def _resolve_property_columns( continue if mapping.field is not None and mapping.path is None: - # Type A: field-only (alias or no-op) + # Type A: field-only (alias or ENUM cast) raw_cols.add(mapping.field) - if key == mapping.field: + if mapping.dtype == "factor": + # Fetch class_label levels from DataCard, register ENUM, + # and emit a CAST expression. Raises ValueError if the + # DataCard does not declare a class_label dtype. + enum_type = f"_enum_{key}" + levels = self._get_class_label_names( + card, config_name, mapping.field + ) + self._ensure_enum_type(enum_type, levels) + expressions.append(f"CAST({mapping.field} AS {enum_type}) AS {key}") + elif key == mapping.field: # no-op -- column already present as raw col pass else: @@ -1085,7 +1466,15 @@ def _resolve_property_columns( continue if mapping.field is not None and mapping.path is not None: - # Type B: field + path -- resolve from definitions + # Type B: field + path -- resolve from definitions. + # dtype='factor' is not supported here: levels come from a + # class_label field, not a definitions path. + if mapping.dtype == "factor": + raise ValueError( + f"dtype='factor' is not supported for field+path mappings " + f"(key='{key}'). Use dtype='factor' only with field-only " + "mappings that reference a class_label field in the DataCard." 
+                )
             raw_cols.add(mapping.field)
             expr = self._build_field_path_expr(
                 key,
@@ -1338,6 +1727,7 @@ def _register_comparative_expanded_view(
             return
 
         cols_sql = ", ".join(extra_cols)
+        self._conn.execute(
         self._conn.execute(
             f"CREATE OR REPLACE VIEW {db_name}_expanded AS "
             f"SELECT *, {cols_sql} FROM {parquet_view}"
@@ -1357,6 +1747,7 @@ def _is_comparative(self, repo_id: str, config_name: str) -> bool:
 
     def _list_views(self) -> list[str]:
         """Return list of public views (excludes internal __ prefixed)."""
+        df = self._conn.execute(
         df = self._conn.execute(
             "SELECT table_name FROM information_schema.tables "
             "WHERE table_schema = 'main' AND table_type = 'VIEW'"
@@ -1365,6 +1756,7 @@ def _view_exists(self, name: str) -> bool:
         """Check whether a view is registered (including internal)."""
+        df = self._conn.execute(
         df = self._conn.execute(
             "SELECT table_name FROM information_schema.tables "
             "WHERE table_schema = 'main' AND table_type = 'VIEW' "
@@ -1406,6 +1798,11 @@ def __repr__(self) -> str:
         n_repos = len(self.config.repositories)
         n_datasets = len(self._db_name_map)
         n_views = len(self._list_views())
+        return (
+            f"VirtualDB({n_repos} repos, "
+            f"{n_datasets} datasets, "
+            f"{n_views} views)"
+        n_views = len(self._list_views())
         return (
             f"VirtualDB({n_repos} repos, "
             f"{n_datasets} datasets, "

From f989b16501be778053effba1fe4a2a683d7d9480 Mon Sep 17 00:00:00 2001
From: chasem
Date: Wed, 11 Mar 2026 12:31:12 -0500
Subject: [PATCH 2/4] trying to resolve merge conflicts

---
 tfbpapi/virtual_db.py | 213 +-----------------------------------------
 1 file changed, 2 insertions(+), 211 deletions(-)

diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py
index 108304d..911f0a2 100644
--- a/tfbpapi/virtual_db.py
+++ b/tfbpapi/virtual_db.py
@@ -8,8 +8,6 @@
 use, a set of common fields, datasets that contain comparative analytics, and more.
 VirtualDB, this code, then uses DuckDB to construct views over Parquet files cached
 locally on initialization. For primary datasets, VirtualDB creates metadata
-VirtualDB, this code, then uses DuckDB to construct views over Parquet files cached
-locally on initialization. For primary datasets, VirtualDB creates metadata
 views (one row per sample with derived columns) and full data views (measurement-level
 data joined to metadata). For comparative analysis datasets, VirtualDB creates expanded
 views that parse composite ID fields into ``_source`` (aliased to the configured
@@ -53,7 +53,6 @@
 from tfbpapi.datacard import DataCard, DatasetSchema
 from tfbpapi.models import DatasetType, MetadataConfig
-from tfbpapi.models import DatasetType, MetadataConfig
 
 logger = logging.getLogger(__name__)
@@ -178,29 +175,19 @@ def __init__(
 
         Creates the DuckDB connection and registers all views immediately.
 
-        Creates the DuckDB connection and registers all views immediately.
-
         :param config_path: Path to YAML configuration file
         :param token: Optional HuggingFace token for private datasets
         :param duckdb_connection: Optional DuckDB connection. If provided, views will
             be registered on this connection instead of creating a new in-memory
             database. This provides a method of using a persistent database file.
             If not provided, an in-memory DuckDB connection is created.
-            This provides a method of using a persistent database file. If not provided,
-            an in-memory DuckDB connection is created.
:raises FileNotFoundError: If config file does not exist :raises ValueError: If configuration is invalid - :raises ValueError: If configuration is invalid """ self.config = MetadataConfig.from_yaml(config_path) self.token = token - self._conn: duckdb.DuckDBPyConnection = ( - duckdb_connection - if duckdb_connection is not None - else duckdb.connect(":memory:") - ) self._conn: duckdb.DuckDBPyConnection = ( duckdb_connection if duckdb_connection is not None @@ -217,10 +204,6 @@ def __init__( self._validate_datacards() self._update_cache() self._register_all_views() - self._load_datacards() - self._validate_datacards() - self._update_cache() - self._register_all_views() # ------------------------------------------------------------------ # Public API @@ -262,8 +245,6 @@ def query(self, sql: str, **params: Any) -> pd.DataFrame: if params: return self._conn.execute(resolved, params).fetchdf() return self._conn.execute(resolved).fetchdf() - return self._conn.execute(resolved, params).fetchdf() - return self._conn.execute(resolved).fetchdf() def prepare(self, name: str, sql: str, overwrite: bool = False) -> None: """ @@ -290,7 +271,6 @@ def prepare(self, name: str, sql: str, overwrite: bool = False) -> None: """ - if name in self._list_views() and not overwrite: error_msg = ( f"Prepared-query name '{name}' collides with " @@ -309,7 +289,6 @@ def tables(self) -> list[str]: """ - return sorted(self._list_views()) def describe(self, table: str | None = None) -> pd.DataFrame: @@ -322,7 +301,6 @@ def describe(self, table: str | None = None) -> pd.DataFrame: """ - if table is not None: df = self._conn.execute(f"DESCRIBE {table}").fetchdf() df = self._conn.execute(f"DESCRIBE {table}").fetchdf() @@ -348,9 +326,7 @@ def get_fields(self, table: str | None = None) -> list[str]: """ - if table is not None: - cols = self._conn.execute( cols = self._conn.execute( f"SELECT column_name FROM information_schema.columns " f"WHERE table_name = '{table}'" @@ -359,7 +335,6 @@ def get_fields(self, table: str | None = None) -> list[str]: all_cols: set[str] = set() for view in self._list_views(): - cols = self._conn.execute( cols = self._conn.execute( f"SELECT column_name FROM information_schema.columns " f"WHERE table_name = '{view}'" @@ -378,14 +353,12 @@ def get_common_fields(self) -> list[str]: """ - meta_views = self._get_primary_meta_view_names() if not meta_views: return [] sets = [] for view in meta_views: - cols = self._conn.execute( cols = self._conn.execute( f"SELECT column_name FROM information_schema.columns " f"WHERE table_name = '{view}'" @@ -436,7 +409,6 @@ def get_tags(self, db_name: str) -> dict[str, str]: # ------------------------------------------------------------------ # Initialisation phases - # Initialisation phases # ------------------------------------------------------------------ def _load_datacards(self) -> None: @@ -463,44 +435,6 @@ def _load_datacards(self) -> None: exc, ) - def _validate_datacards(self) -> None: - """ - Cross-check the VirtualDB config against the loaded datacards. - - Checks that every dataset with a ``links`` field in the VirtualDB - config has ``dataset_type: comparative`` in its HuggingFace datacard. - Also resolves ``self._dataset_schemas`` and - ``self._external_meta_configs`` (keyed by ``db_name``) for use by - ``_update_cache`` and ``_register_all_views``. - - :raises ValueError: If a dataset with ``links`` does not have - ``dataset_type: comparative`` in its datacard. 
- - """ - def _load_datacards(self) -> None: - """ - Fetch (or load from cache) the DataCard for every distinct repo. - - Populates ``self._datacards`` keyed by ``repo_id``. Failures are - logged as warnings and the repo is omitted from the dict so that - subsequent phases can skip it gracefully. - - """ - self._datacards: dict[str, DataCard] = {} - seen_repos: set[str] = set() - for repo_id, _ in self._db_name_map.values(): - if repo_id in seen_repos: - continue - seen_repos.add(repo_id) - try: - self._datacards[repo_id] = _cached_datacard(repo_id, token=self.token) - except Exception as exc: - logger.warning( - "Could not load datacard for repo '%s': %s", - repo_id, - exc, - ) - def _validate_datacards(self) -> None: """ Cross-check the VirtualDB config against the loaded datacards. @@ -519,9 +453,6 @@ def _validate_datacards(self) -> None: # db_name -> external metadata config_name (for applies_to datasets) self._external_meta_configs: dict[str, str] = {} - # db_name -> external metadata config_name (for applies_to datasets) - self._external_meta_configs: dict[str, str] = {} - for db_name, (repo_id, config_name) in self._db_name_map.items(): repo_cfg = self.config.repositories.get(repo_id) ds_cfg = ( @@ -531,34 +462,6 @@ def _validate_datacards(self) -> None: ) card = self._datacards.get(repo_id) - # Validate comparative dataset_type agreement. - if ds_cfg and ds_cfg.links: - if card is not None: - dc_config = card.get_config(config_name) - if ( - dc_config is not None - and dc_config.dataset_type != DatasetType.COMPARATIVE - ): - raise ValueError( - f"Dataset '{config_name}' in repo '{repo_id}' has " - f"'links' in the VirtualDB config, indicating a " - f"comparative dataset, but the HuggingFace datacard " - f"declares dataset_type='{dc_config.dataset_type}'. " - f"Update the datacard to use dataset_type: comparative." - ) - continue # comparative datasets need no schema resolution - - # Resolve dataset schema and external metadata config for - # primary datasets. - if card is None: - repo_cfg = self.config.repositories.get(repo_id) - ds_cfg = ( - repo_cfg.dataset.get(config_name) - if repo_cfg and repo_cfg.dataset - else None - ) - card = self._datacards.get(repo_id) - # Validate comparative dataset_type agreement. if ds_cfg and ds_cfg.links: if card is not None: @@ -596,9 +499,6 @@ def _validate_datacards(self) -> None: schema is not None and schema.metadata_source == "external" and schema.external_metadata_config - schema is not None - and schema.metadata_source == "external" - and schema.external_metadata_config ): self._external_meta_configs[db_name] = schema.external_metadata_config @@ -623,54 +523,6 @@ def _update_cache(self) -> None: files = self._resolve_parquet_files(repo_id, ext_config_name) self._parquet_files[f"__{db_name}_meta"] = files - def _register_all_views(self) -> None: - """ - Register all DuckDB views in dependency order. - - Expects ``self._parquet_files``, ``self._dataset_schemas``, and - ``self._external_meta_configs`` to have been populated by the earlier - init phases. No network or disk access occurs here. - - """ - # 1. Raw per-dataset views (internal ___parquet - # plus public for primary datasets only) - for db_name, (repo_id, config_name) in self._db_name_map.items(): - comparative = self._is_comparative(repo_id, config_name) - self._register_raw_view( - db_name, - parquet_only=comparative, - ) - - # 2. External metadata parquet views. 
- # When a data config's metadata lives in a separate HF config - # (applies_to), register its parquet as ___metadata_parquet. - self._external_meta_views: dict[str, str] = {} - for db_name, ext_config_name in self._external_meta_configs.items(): - meta_view = f"__{db_name}_metadata_parquet" - files = self._parquet_files.get(f"__{db_name}_meta", []) - self._external_meta_configs[db_name] = schema.external_metadata_config - - def _update_cache(self) -> None: - """ - Download (or locate cached) Parquet files for all dataset configs. - - Populates ``self._parquet_files`` keyed by ``db_name``. For datasets - with external metadata (identified during ``_validate_datacards``), - also downloads those files and stores them under the key - ``"___meta"`` so ``_register_all_views`` can read them - without further network calls. - - """ - self._parquet_files: dict[str, list[str]] = {} - for db_name, (repo_id, config_name) in self._db_name_map.items(): - files = self._resolve_parquet_files(repo_id, config_name) - self._parquet_files[db_name] = files - - for db_name, ext_config_name in self._external_meta_configs.items(): - repo_id, _ = self._db_name_map[db_name] - files = self._resolve_parquet_files(repo_id, ext_config_name) - self._parquet_files[f"__{db_name}_meta"] = files - def _register_all_views(self) -> None: """ Register all DuckDB views in dependency order. @@ -702,14 +554,10 @@ def _register_all_views(self) -> None: "'%s' (db_name '%s') -- skipping external metadata view", ext_config_name, db_name, - "'%s' (db_name '%s') -- skipping external metadata view", - ext_config_name, - db_name, ) continue files_sql = ", ".join(f"'{f}'" for f in files) try: - self._conn.execute( self._conn.execute( f"CREATE OR REPLACE VIEW {meta_view} AS " f"SELECT * FROM read_parquet([{files_sql}])" @@ -831,8 +679,8 @@ def _register_raw_view( parquet_only: bool = False, ) -> None: """ - Register a raw DuckDB view over pre-resolved Parquet files. - Register a raw DuckDB view over pre-resolved Parquet files. + Register a raw DuckDB view over pre-resolved Parquet files. Register a raw + DuckDB view over pre-resolved Parquet files. Creates an internal ``___parquet`` view that reads directly from the Parquet files. 
For primary datasets, also @@ -854,10 +702,8 @@ def _register_raw_view( """ files = self._parquet_files.get(db_name, []) - files = self._parquet_files.get(db_name, []) if not files: logger.warning( - "No parquet files for db_name '%s' -- skipping view", "No parquet files for db_name '%s' -- skipping view", db_name, ) @@ -865,7 +711,6 @@ def _register_raw_view( files_sql = ", ".join(f"'{f}'" for f in files) parquet_sql = f"SELECT * FROM read_parquet([{files_sql}])" - self._conn.execute( self._conn.execute( f"CREATE OR REPLACE VIEW __{db_name}_parquet AS " f"{parquet_sql}" ) @@ -886,22 +731,6 @@ def _register_raw_view( cols_sql = ", ".join(parts) public_select = f"SELECT {cols_sql} FROM __{db_name}_parquet" self._conn.execute(f"CREATE OR REPLACE VIEW {db_name} AS {public_select}") - sample_col = self._get_sample_id_col(db_name) - if sample_col == "sample_id": - public_select = f"SELECT * FROM __{db_name}_parquet" - else: - raw_cols = self._get_view_columns(f"__{db_name}_parquet") - parts: list[str] = [] - for col in raw_cols: - if col == sample_col: - parts.append(f"{col} AS sample_id") - elif col == "sample_id": - parts.append(f"{col} AS sample_id_orig") - else: - parts.append(col) - cols_sql = ", ".join(parts) - public_select = f"SELECT {cols_sql} FROM __{db_name}_parquet" - self._conn.execute(f"CREATE OR REPLACE VIEW {db_name} AS {public_select}") def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> None: """ @@ -937,8 +766,6 @@ def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> N # FROM clause construction. schema: DatasetSchema | None = self._dataset_schemas.get(db_name) ext_meta_view: str | None = self._external_meta_views.get(db_name) - schema: DatasetSchema | None = self._dataset_schemas.get(db_name) - ext_meta_view: str | None = self._external_meta_views.get(db_name) is_external = ( ext_meta_view is not None @@ -1216,39 +1043,11 @@ def _enrich_raw_view(self, db_name: str) -> None: else: join_clause = f"JOIN {meta_name} m USING ({sample_col})" - self._conn.execute( - if rename_sample: - # Build explicit SELECT to rename the sample column - raw_parts: list[str] = [] - for col in raw_cols_list: - if col == sample_col: - raw_parts.append(f"r.{col} AS sample_id") - elif col == "sample_id": - raw_parts.append(f"r.{col} AS sample_id_orig") - else: - raw_parts.append(f"r.{col}") - raw_select = ", ".join(raw_parts) - else: - raw_select = "r.*" - - if extra_cols: - extra_select = ", ".join(f"m.{c}" for c in sorted(extra_cols)) - full_select = f"{raw_select}, {extra_select}" - else: - full_select = raw_select - - if rename_sample: - join_clause = f"JOIN {meta_name} m ON r.{sample_col} = m.sample_id" - else: - join_clause = f"JOIN {meta_name} m USING ({sample_col})" - self._conn.execute( f"CREATE OR REPLACE VIEW {db_name} AS " f"SELECT {full_select} " - f"SELECT {full_select} " f"FROM {parquet_name} r " f"{join_clause}" - f"{join_clause}" ) def _get_view_columns(self, view: str) -> list[str]: @@ -1727,7 +1526,6 @@ def _register_comparative_expanded_view( return cols_sql = ", ".join(extra_cols) - self._conn.execute( self._conn.execute( f"CREATE OR REPLACE VIEW {db_name}_expanded AS " f"SELECT *, {cols_sql} FROM {parquet_view}" @@ -1747,7 +1545,6 @@ def _is_comparative(self, repo_id: str, config_name: str) -> bool: def _list_views(self) -> list[str]: """Return list of public views (excludes internal __ prefixed).""" - df = self._conn.execute( df = self._conn.execute( "SELECT table_name FROM information_schema.tables " "WHERE table_schema = 
'main' AND table_type = 'VIEW'" @@ -1756,7 +1553,6 @@ def _list_views(self) -> list[str]: def _view_exists(self, name: str) -> bool: """Check whether a view is registered (including internal).""" - df = self._conn.execute( df = self._conn.execute( "SELECT table_name FROM information_schema.tables " "WHERE table_schema = 'main' AND table_type = 'VIEW' " @@ -1798,11 +1594,6 @@ def __repr__(self) -> str: n_repos = len(self.config.repositories) n_datasets = len(self._db_name_map) n_views = len(self._list_views()) - return ( - f"VirtualDB({n_repos} repos, " - f"{n_datasets} datasets, " - f"{n_views} views)" - n_views = len(self._list_views()) return ( f"VirtualDB({n_repos} repos, " f"{n_datasets} datasets, " From 83c3382fa52c9ef9ad54772bcaf6fdeffc7a9885 Mon Sep 17 00:00:00 2001 From: chasem Date: Wed, 11 Mar 2026 13:05:10 -0500 Subject: [PATCH 3/4] removing duplicate code lines --- tfbpapi/virtual_db.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index 911f0a2..45f477c 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -302,14 +302,12 @@ def describe(self, table: str | None = None) -> pd.DataFrame: """ if table is not None: - df = self._conn.execute(f"DESCRIBE {table}").fetchdf() df = self._conn.execute(f"DESCRIBE {table}").fetchdf() df.insert(0, "table", table) return df frames = [] for view in sorted(self._list_views()): - df = self._conn.execute(f"DESCRIBE {view}").fetchdf() df = self._conn.execute(f"DESCRIBE {view}").fetchdf() df.insert(0, "table", view) frames.append(df) @@ -571,13 +569,11 @@ def _register_all_views(self) -> None: continue self._external_meta_views[db_name] = meta_view - # 3. Metadata views for primary datasets (_meta) # 3. Metadata views for primary datasets (_meta) for db_name, (repo_id, config_name) in self._db_name_map.items(): if not self._is_comparative(repo_id, config_name): self._register_meta_view(db_name, repo_id, config_name) - # 4. Replace primary raw views with join to _meta so # 4. Replace primary raw views with join to _meta so # derived columns (e.g. carbon_source) are available for db_name, (repo_id, config_name) in self._db_name_map.items(): @@ -693,9 +689,6 @@ def _register_raw_view( Parquet files must have been resolved by ``_update_cache`` before this method is called. - Parquet files must have been resolved by ``_update_cache`` - before this method is called. - :param db_name: View name :param parquet_only: If True, only create the internal ``___parquet`` view (no public ````). @@ -874,7 +867,6 @@ def qualify(col: str) -> str: seen: set[str] = set() select_parts: list[str] = [] rename_sample = sample_col != "sample_id" - rename_sample = sample_col != "sample_id" def add_col(col: str) -> None: if col in seen: @@ -944,7 +936,6 @@ def add_col(col: str) -> None: ) try: self._conn.execute(sql) - self._conn.execute(sql) except BinderException as exc: raise BinderException( f"Failed to create meta view '{db_name}_meta'.\n" @@ -1010,9 +1001,6 @@ def _enrich_raw_view(self, db_name: str) -> None: extra_cols = meta_cols - raw_cols if not extra_cols: - # No derived columns to add -- the view created in - # _register_raw_view (which already handles the rename) - # is sufficient. # No derived columns to add -- the view created in # _register_raw_view (which already handles the rename) # is sufficient. 
@@ -1060,7 +1048,6 @@ def _get_view_columns(self, view: str) -> list[str]: """ df = self._conn.execute(f"DESCRIBE {view}").fetchdf() - df = self._conn.execute(f"DESCRIBE {view}").fetchdf() return df["column_name"].tolist() def _get_sample_id_col(self, db_name: str) -> str: @@ -1090,9 +1077,6 @@ def _resolve_metadata_fields( """ try: - card = self._datacards.get(repo_id) or _cached_datacard( - repo_id, token=self.token - ) card = self._datacards.get(repo_id) or _cached_datacard( repo_id, token=self.token ) @@ -1227,9 +1211,6 @@ def _resolve_property_columns( card = self._datacards.get(repo_id) or _cached_datacard( repo_id, token=self.token ) - card = self._datacards.get(repo_id) or _cached_datacard( - repo_id, token=self.token - ) except Exception as exc: logger.warning( "Could not load DataCard for %s: %s", From 385ee5736b31aebd99c2fbdbee4dd63c222cdff8 Mon Sep 17 00:00:00 2001 From: chasem Date: Wed, 11 Mar 2026 13:07:57 -0500 Subject: [PATCH 4/4] still removing duplicate lines --- tfbpapi/virtual_db.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index 45f477c..5ac1178 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -675,8 +675,7 @@ def _register_raw_view( parquet_only: bool = False, ) -> None: """ - Register a raw DuckDB view over pre-resolved Parquet files. Register a raw - DuckDB view over pre-resolved Parquet files. + Register a raw DuckDB view over pre-resolved Parquet files. Creates an internal ``___parquet`` view that reads directly from the Parquet files. For primary datasets, also @@ -885,12 +884,6 @@ def add_col(col: str) -> None: add_col(sample_col) # When renaming, check if the parquet source also has a literal # "sample_id" column; if so, preserve it as "sample_id_orig". - if rename_sample: - source_cols = set(self._get_view_columns(parquet_view)) - if "sample_id" in source_cols: - add_col("sample_id") - # When renaming, check if the parquet source also has a literal - # "sample_id" column; if so, preserve it as "sample_id_orig". if rename_sample: source_cols = set(self._get_view_columns(parquet_view)) if "sample_id" in source_cols:
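
Taken together, the series adds the `factor` dtype and then removes the duplicated lines introduced by the merge. A hypothetical usage sketch of the end result, not part of the diff (the config path is illustrative; `hackett` and `time` are the docs example from PATCH 1):

```python
from tfbpapi.virtual_db import VirtualDB

# Illustrative config path; assumes the hackett example config from
# docs/virtual_db_configuration.md with `time: {field: time, dtype: factor}`.
v = VirtualDB("virtual_db_config.yaml")

# `time` in hackett_meta is ENUM-typed, so levels compare as strings drawn
# from the DataCard's class_label names.
df = v.query(
    "SELECT sample_id, time, time_orig FROM hackett_meta WHERE time = '45'"
)

# The raw, pre-cast values remain available under the `_orig` alias.
print(df[["time", "time_orig"]].head())

# A value outside the declared levels (e.g. '37') should fail the implicit
# cast to the ENUM type with a DuckDB conversion error.
```

Because the levels are declared once in the DataCard rather than inferred from the data, VirtualDB can reject out-of-vocabulary values at query time instead of silently passing them through.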