diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md index 06897c3..90e58d0 100644 --- a/docs/virtual_db_configuration.md +++ b/docs/virtual_db_configuration.md @@ -139,13 +139,77 @@ during metadata extraction and query filtering. - `string` - Text data (default if not specified) - `numeric` - Numeric values (integers or floating-point numbers) - `bool` - Boolean values (true/false) +- `factor` - Categorical data backed by a DuckDB ENUM type (see below) **When to use dtype**: 1. **Numeric filtering**: Required for fields used with comparison operators (`<`, `>`, `<=`, `>=`, `between`) 2. **Type consistency**: When source data might be extracted with incorrect type -3. **Performance**: Helps with query optimization and prevents type mismatches +3. **Categorical columns**: Use `factor` when a field has a fixed, known set of + levels and you want DuckDB to enforce membership and enable efficient storage + +### factor dtype (DuckDB ENUM) + +When `dtype: factor` is set on a field-only mapping, VirtualDB registers a DuckDB +ENUM type from the field's `class_label` definition in the DataCard and casts the +column to that type in the `_meta` view. + +**Requirements**: + +- `dtype: factor` may only be used with field-only mappings (`field:` specified, + no `path:` or `expression:`). +- The DataCard must declare the field with `dtype: {class_label: {names: [...]}}`. + If the field is missing, has a non-`class_label` dtype, or the `names` list is + absent or empty, VirtualDB raises a `ValueError` at view-registration time. + +**Column naming when the output name matches the source field**: + +When the mapping key equals the source field name (the common case, e.g. 
+`time: {field: time, dtype: factor}`), the raw column is preserved in the view +under a `_orig` alias so that the original values remain accessible: + +- `time` -- ENUM-typed column with levels from the DataCard +- `time_orig` -- original raw column (e.g., DOUBLE or VARCHAR) + +If `time_orig` already exists in the parquet, VirtualDB finds the next available +name: `time_orig_1`, `time_orig_2`, etc. + +**Example DataCard feature definition** (in the HuggingFace dataset card YAML): + +```yaml +- name: time + dtype: + class_label: + names: + - 0 + - 5 + - 10 + - 15 + - 20 + - 45 + - 90 + description: Time point in minutes after induction +``` + +**Example VirtualDB config**: + +```yaml +repositories: + BrentLab/hackett_2020: + dataset: + hackett_2020: + db_name: hackett + sample_id: + field: sample_id + time: + field: time + dtype: factor +``` + +After view registration, `hackett_meta` will contain: +- `time` -- ENUM column, queryable as `WHERE time = '45'` +- `time_orig` -- original numeric column ## Tags diff --git a/tfbpapi/models.py b/tfbpapi/models.py index c2c9501..ae0d918 100644 --- a/tfbpapi/models.py +++ b/tfbpapi/models.py @@ -364,7 +364,13 @@ class PropertyMapping(BaseModel): None, description="SQL expression for derived fields" ) dtype: str | None = Field( - None, description="Data type for conversion: 'string', 'numeric', or 'bool'" + None, + description=( + "Data type for conversion: 'string', 'numeric', 'bool', or 'factor'. " + "When 'factor', the field must reference a DataCard field with a " + "class_label dtype specifying the allowed levels. VirtualDB will " + "register a DuckDB ENUM type and cast the column to it." + ), ) @field_validator("path", "field", "expression", mode="before") @@ -390,6 +396,8 @@ def validate_field_types(self) -> "PropertyMapping": """ Ensure at least one field type is specified and mutually exclusive. + Also validates dtype='factor' requires a field (not expression or path-only). 
+ :return: The validated PropertyMapping instance :raises ValueError: If validation constraints are violated @@ -404,6 +412,12 @@ def validate_field_types(self) -> "PropertyMapping": raise ValueError( "At least one of 'field', 'path', or 'expression' must be specified" ) + if self.dtype == "factor": + if self.expression is not None or self.field is None: + raise ValueError( + "dtype='factor' requires 'field' to be specified and " + "cannot be used with 'expression' or as a path-only mapping" + ) return self diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py index 6129880..3d32e52 100644 --- a/tfbpapi/tests/test_virtual_db.py +++ b/tfbpapi/tests/test_virtual_db.py @@ -15,7 +15,7 @@ import yaml # type: ignore from tfbpapi.datacard import DatasetSchema -from tfbpapi.models import DatasetType, MetadataConfig +from tfbpapi.models import DatasetType, FeatureInfo, MetadataConfig from tfbpapi.virtual_db import VirtualDB # ------------------------------------------------------------------ @@ -1469,3 +1469,258 @@ def test_external_metadata_join(self, tmp_path, monkeypatch): raw_result = v.query("SELECT * FROM chip ORDER BY sample_id") assert "db_id" in raw_result.columns assert len(raw_result) == 4 # 4 data rows + + +# ------------------------------------------------------------------ +# Tests: dtype='factor' (DuckDB ENUM) +# ------------------------------------------------------------------ + + +class TestFactorDtype: + """Tests for PropertyMapping dtype='factor' and DuckDB ENUM columns.""" + + def _make_vdb_with_factor(self, tmp_path, monkeypatch, feature_dtype): + """ + Helper: build a VirtualDB with one dataset whose 'category' field + has a PropertyMapping with dtype='factor'. ``feature_dtype`` is + passed as the FeatureInfo.dtype for the 'category' field in the + mock DataCard. 
+ """ + import tfbpapi.virtual_db as vdb_module + + df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2], + "category": ["A", "B", "A", "C"], + "value": [1.0, 2.0, 3.0, 4.0], + } + ) + parquet_path = tmp_path / "data.parquet" + files = {("TestOrg/ds", "cfg"): [_write_parquet(parquet_path, df)]} + + config = { + "repositories": { + "TestOrg/ds": { + "dataset": { + "cfg": { + "db_name": "ds", + "sample_id": {"field": "sample_id"}, + "category": { + "field": "category", + "dtype": "factor", + }, + } + } + } + } + } + config_file = tmp_path / "config.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + card = MagicMock() + card.get_metadata_fields.return_value = ["sample_id", "category"] + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + card.get_metadata_config_name.return_value = None + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"sample_id", "category", "value"}, + metadata_columns={"sample_id", "category"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + feature_list = [ + FeatureInfo( + name="category", + dtype=feature_dtype, + description="A categorical field", + ), + FeatureInfo( + name="sample_id", + dtype="int64", + description="Sample identifier", + ), + ] + card.get_features.return_value = feature_list + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + return VirtualDB(config_file) + + def test_factor_dtype_creates_enum_column(self, tmp_path, monkeypatch): + """Dtype='factor' casts the column to a DuckDB ENUM in the _meta view.""" + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + df = v.query("SELECT * FROM ds_meta ORDER BY sample_id") + assert 
"category" in df.columns + # Values should be preserved + assert set(df["category"].dropna()) == {"A", "B", "C"} + + def test_factor_dtype_enum_type_registered(self, tmp_path, monkeypatch): + """The DuckDB ENUM type is registered and can be queried.""" + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + # Trigger view registration + v.tables() + # The ENUM type should be registered in DuckDB + types_df = v._conn.execute( + "SELECT type_name FROM duckdb_types() WHERE logical_type = 'ENUM'" + ).fetchdf() + assert "_enum_category" in types_df["type_name"].tolist() + + def test_factor_dtype_missing_class_label_raises(self, tmp_path, monkeypatch): + """ValueError is raised when the DataCard field has no class_label dtype.""" + with pytest.raises(ValueError, match="class_label"): + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype="string", # not a class_label dict + ) + v.tables() # triggers view registration + + def test_factor_dtype_no_names_raises(self, tmp_path, monkeypatch): + """ValueError is raised when class_label has no 'names' key.""" + with pytest.raises(ValueError, match="names"): + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {}}, # no names + ) + v.tables() + + def test_factor_dtype_model_validator_requires_field(self): + """PropertyMapping with dtype='factor' and no field raises ValidationError.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError, match="factor"): + from tfbpapi.models import PropertyMapping + + PropertyMapping.model_validate({"path": "some.path", "dtype": "factor"}) + + def test_factor_dtype_model_validator_rejects_expression(self): + """PropertyMapping with dtype='factor' and expression raises ValidationError.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError): + from tfbpapi.models import PropertyMapping + + 
PropertyMapping.model_validate({"expression": "col > 0", "dtype": "factor"}) + + def test_factor_dtype_inplace_renames_raw_to_orig(self, tmp_path, monkeypatch): + """ + When dtype='factor' maps a field to the same output name (e.g. + category: {field: category, dtype: factor}), the raw column is + renamed to _orig in the _meta view, and the ENUM-cast column + keeps the original name. + """ + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + df = v.query("SELECT * FROM ds_meta ORDER BY sample_id") + # ENUM-cast column keeps the original name + assert "category" in df.columns + # Raw numeric/string original is preserved under _orig alias + assert "category_orig" in df.columns + # The _orig column should hold the raw values + assert set(df["category_orig"].dropna()) == {"A", "B", "C"} + + def test_factor_dtype_orig_suffix_avoids_collision(self, tmp_path, monkeypatch): + """When _orig already exists in the parquet, the rename uses _orig_1 + instead.""" + import tfbpapi.virtual_db as vdb_module + + df = pd.DataFrame( + { + "sample_id": [1, 2], + "category": ["A", "B"], + "category_orig": ["x", "y"], # pre-existing _orig column + "value": [1.0, 2.0], + } + ) + parquet_path = tmp_path / "data2.parquet" + files = {("TestOrg/ds2", "cfg2"): [_write_parquet(parquet_path, df)]} + + config = { + "repositories": { + "TestOrg/ds2": { + "dataset": { + "cfg2": { + "db_name": "ds2", + "sample_id": {"field": "sample_id"}, + "category": { + "field": "category", + "dtype": "factor", + }, + } + } + } + } + } + config_file = tmp_path / "config2.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + card = MagicMock() + card.get_metadata_fields.return_value = [ + "sample_id", + "category", + "category_orig", + ] + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + card.get_metadata_config_name.return_value = None + card.get_dataset_schema.return_value 
= DatasetSchema( + data_columns={"sample_id", "category", "category_orig", "value"}, + metadata_columns={"sample_id", "category", "category_orig"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + card.get_features.return_value = [ + FeatureInfo( + name="category", + dtype={"class_label": {"names": ["A", "B"]}}, + description="categorical", + ), + FeatureInfo( + name="sample_id", + dtype="int64", + description="id", + ), + ] + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + v = VirtualDB(config_file) + + result = v.query("SELECT * FROM ds2_meta ORDER BY sample_id") + # Should use _orig_1 because _orig is taken + assert "category_orig_1" in result.columns + assert "category" in result.columns diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index eaa77d3..5ac1178 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -580,6 +580,7 @@ def _register_all_views(self) -> None: if not self._is_comparative(repo_id, config_name): self._enrich_raw_view(db_name) + # 5. Comparative expanded views (pre-parsed composite IDs) # 5. Comparative expanded views (pre-parsed composite IDs) for db_name, (repo_id, config_name) in self._db_name_map.items(): repo_cfg = self.config.repositories.get(repo_id) @@ -815,7 +816,49 @@ def qualify(col: str) -> str: return f"m.{col}" return f"d.{col}" + # Resolve derived property expressions first. + # When a factor mapping has the same output name as its source + # field (e.g. time -> time), the raw column must be renamed to + # avoid a duplicate column name in the SELECT. The rename uses + # "_orig", or "_orig_1", etc., to avoid collisions with + # other columns that already exist in the parquet. 
+        prop_result = self._resolve_property_columns(repo_id, config_name)
+
+        # Collect all column names that exist in the parquet so we can
+        # find a unique _orig suffix when needed.
+        all_parquet_cols: set[str] = set(self._get_view_columns(parquet_view))
+
+        # Map: raw_col -> alias_in_select for factor-overridden cols
+        factor_col_renames: dict[str, str] = {}
+        if prop_result is not None:
+            _derived_exprs, _prop_raw_cols = prop_result
+            for expr in _derived_exprs:
+                # Detect "CAST(<field> AS _enum_<key>) AS <out_col>" patterns
+                # where <field> == <out_col> (in-place factor override)
+                if not expr.startswith("CAST("):
+                    continue
+                parts = expr.rsplit(" AS ", 1)
+                if len(parts) != 2:
+                    continue
+                out_col = parts[1].strip()
+                # Check whether the source field has the same name as
+                # the output column (in-place override case)
+                cast_inner = parts[0][len("CAST(") :]
+                src_field = cast_inner.split(" AS ")[0].strip()
+                if src_field == out_col and out_col in all_parquet_cols:
+                    # Find a unique _orig name
+                    candidate = f"{out_col}_orig"
+                    n = 1
+                    while candidate in all_parquet_cols or candidate in (
+                        v for v in factor_col_renames.values()
+                    ):
+                        candidate = f"{out_col}_orig_{n}"
+                        n += 1
+                    factor_col_renames[src_field] = candidate
+        # Build SELECT: sample_id + metadata cols (deduplicated).
+        # Raw columns that are factor-overridden are emitted with their
+        # _orig alias instead of their original name.
         # If the configured sample_id column differs from "sample_id",
         # rename it so all views expose a consistent "sample_id" column.
# If the parquet also has a literal "sample_id" column, preserve @@ -825,14 +868,18 @@ def qualify(col: str) -> str: rename_sample = sample_col != "sample_id" def add_col(col: str) -> None: - if col not in seen: - seen.add(col) - if rename_sample and col == sample_col: - select_parts.append(f"{qualify(col)} AS sample_id") - elif rename_sample and col == "sample_id": - select_parts.append(f"{qualify(col)} AS sample_id_orig") - else: - select_parts.append(qualify(col)) + if col in seen: + return + seen.add(col) + alias = factor_col_renames.get(col) + if alias: + select_parts.append(f"{qualify(col)} AS {alias}") + elif rename_sample and col == sample_col: + select_parts.append(f"{qualify(col)} AS sample_id") + elif rename_sample and col == "sample_id": + select_parts.append(f"{qualify(col)} AS sample_id_orig") + else: + select_parts.append(qualify(col)) add_col(sample_col) # When renaming, check if the parquet source also has a literal @@ -845,12 +892,21 @@ def add_col(col: str) -> None: add_col(col) # Add derived property expressions from the VirtualDB config - prop_result = self._resolve_property_columns(repo_id, config_name) if prop_result is not None: derived_exprs, prop_raw_cols = prop_result # Ensure source columns needed by expressions are selected for col in prop_raw_cols: add_col(col) + # Rewrite CAST expressions to use the _orig alias when the + # source field was renamed to avoid collision. 
+            if factor_col_renames:
+                rewritten = []
+                for expr in derived_exprs:
+                    for orig, alias in factor_col_renames.items():
+                        # Replace "CAST(<field> AS" with "CAST(<alias> AS"
+                        expr = expr.replace(f"CAST({orig} AS", f"CAST({alias} AS")
+                    rewritten.append(expr)
+                derived_exprs = rewritten
             # Qualify source column references inside CASE WHEN expressions
             if is_join:
                 qualified_exprs = []
@@ -1008,6 +1064,76 @@ def _resolve_metadata_fields(
         )
         return None
 
+    def _get_class_label_names(
+        self, card: Any, config_name: str, field: str
+    ) -> list[str]:
+        """
+        Return the ENUM levels for a field with class_label dtype.
+ + Looks up the FeatureInfo for ``field`` in the DataCard config and + extracts the ``names`` list from its ``class_label`` dtype dict. + + :param card: DataCard instance + :param config_name: Configuration name + :param field: Field name to look up + :return: List of level strings + :raises ValueError: If the field is not found, has no class_label dtype, + or the class_label dict has no ``names`` key + + """ + try: + features = card.get_features(config_name) + except Exception as exc: + raise ValueError( + f"Could not retrieve features for config '{config_name}': {exc}" + ) from exc + + feature = next((f for f in features if f.name == field), None) + if feature is None: + raise ValueError( + f"Field '{field}' not found in DataCard config '{config_name}'. " + "dtype='factor' requires the field to be declared in the DataCard." + ) + + dtype = feature.dtype + if not isinstance(dtype, dict) or "class_label" not in dtype: + raise ValueError( + f"dtype='factor' is set for field '{field}' in config " + f"'{config_name}', but the DataCard dtype is {dtype!r} rather " + "than a class_label dict. " + "The DataCard must declare dtype: {class_label: {names: [...]}}." + ) + + class_label = dtype["class_label"] + names = class_label.get("names") if isinstance(class_label, dict) else None + if not names: + raise ValueError( + f"class_label for field '{field}' in config '{config_name}' " + "has no 'names' key or the names list is empty. " + "Specify levels as: class_label: {names: [level1, level2, ...]}." + ) + + return [str(n) for n in names] + + def _ensure_enum_type(self, type_name: str, levels: list[str]) -> None: + """ + Create or replace a DuckDB ENUM type with the given levels. + + DuckDB ENUM types must be registered before use in CAST expressions. Drops any + existing type with the same name first to allow re-registration on repeated view + builds. 
+ + :param type_name: SQL identifier for the ENUM type + :param levels: Ordered list of allowed string values + + """ + try: + self._conn.execute(f"DROP TYPE IF EXISTS {type_name}") + except Exception: + pass # type may not exist yet + escaped = ", ".join(f"'{v.replace(chr(39), chr(39)*2)}'" for v in levels) + self._conn.execute(f"CREATE TYPE {type_name} AS ENUM ({escaped})") + def _resolve_alias(self, col: str, value: str) -> str: """ Apply factor alias to a value if one is configured. @@ -1075,9 +1219,19 @@ def _resolve_property_columns( continue if mapping.field is not None and mapping.path is None: - # Type A: field-only (alias or no-op) + # Type A: field-only (alias or ENUM cast) raw_cols.add(mapping.field) - if key == mapping.field: + if mapping.dtype == "factor": + # Fetch class_label levels from DataCard, register ENUM, + # and emit a CAST expression. Raises ValueError if the + # DataCard does not declare a class_label dtype. + enum_type = f"_enum_{key}" + levels = self._get_class_label_names( + card, config_name, mapping.field + ) + self._ensure_enum_type(enum_type, levels) + expressions.append(f"CAST({mapping.field} AS {enum_type}) AS {key}") + elif key == mapping.field: # no-op -- column already present as raw col pass else: @@ -1085,7 +1239,15 @@ def _resolve_property_columns( continue if mapping.field is not None and mapping.path is not None: - # Type B: field + path -- resolve from definitions + # Type B: field + path -- resolve from definitions. + # dtype='factor' is not supported here: levels come from a + # class_label field, not a definitions path. + if mapping.dtype == "factor": + raise ValueError( + f"dtype='factor' is not supported for field+path mappings " + f"(key='{key}'). Use dtype='factor' only with field-only " + "mappings that reference a class_label field in the DataCard." + ) raw_cols.add(mapping.field) expr = self._build_field_path_expr( key,