diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md
index 06897c3..90e58d0 100644
--- a/docs/virtual_db_configuration.md
+++ b/docs/virtual_db_configuration.md
@@ -139,13 +139,77 @@ during metadata extraction and query filtering.
- `string` - Text data (default if not specified)
- `numeric` - Numeric values (integers or floating-point numbers)
- `bool` - Boolean values (true/false)
+- `factor` - Categorical data backed by a DuckDB ENUM type (see below)
**When to use dtype**:
1. **Numeric filtering**: Required for fields used with comparison operators
(`<`, `>`, `<=`, `>=`, `between`)
2. **Type consistency**: When source data might be extracted with incorrect type
-3. **Performance**: Helps with query optimization and prevents type mismatches
+3. **Categorical columns**: Use `factor` when a field has a fixed, known set of
+ levels and you want DuckDB to enforce membership and enable efficient storage
+
+### factor dtype (DuckDB ENUM)
+
+When `dtype: factor` is set on a field-only mapping, VirtualDB registers a DuckDB
+ENUM type from the field's `class_label` definition in the DataCard and casts the
+column to that type in the `_meta` view.
+
+**Requirements**:
+
+- `dtype: factor` may only be used with field-only mappings (`field:` specified,
+ no `path:` or `expression:`).
+- The DataCard must declare the field with `dtype: {class_label: {names: [...]}}`.
+ If the field is missing, has a non-`class_label` dtype, or the `names` list is
+ absent or empty, VirtualDB raises a `ValueError` at view-registration time.
+
+**Column naming when the output name matches the source field**:
+
+When the mapping key equals the source field name (the common case, e.g.
+`time: {field: time, dtype: factor}`), the raw column is preserved in the view
+under a `<name>_orig` alias so that the original values remain accessible:
+
+- `time` -- ENUM-typed column with levels from the DataCard
+- `time_orig` -- original raw column (e.g., DOUBLE or VARCHAR)
+
+If `time_orig` already exists in the parquet, VirtualDB finds the next available
+name: `time_orig_1`, `time_orig_2`, etc.
+
+**Example DataCard feature definition** (in the HuggingFace dataset card YAML):
+
+```yaml
+- name: time
+ dtype:
+ class_label:
+ names:
+ - 0
+ - 5
+ - 10
+ - 15
+ - 20
+ - 45
+ - 90
+ description: Time point in minutes after induction
+```
+
+**Example VirtualDB config**:
+
+```yaml
+repositories:
+ BrentLab/hackett_2020:
+ dataset:
+ hackett_2020:
+ db_name: hackett
+ sample_id:
+ field: sample_id
+ time:
+ field: time
+ dtype: factor
+```
+
+After view registration, `hackett_meta` will contain:
+- `time` -- ENUM column, queryable as `WHERE time = '45'`
+- `time_orig` -- original numeric column
## Tags
diff --git a/tfbpapi/models.py b/tfbpapi/models.py
index c2c9501..ae0d918 100644
--- a/tfbpapi/models.py
+++ b/tfbpapi/models.py
@@ -364,7 +364,13 @@ class PropertyMapping(BaseModel):
None, description="SQL expression for derived fields"
)
dtype: str | None = Field(
- None, description="Data type for conversion: 'string', 'numeric', or 'bool'"
+ None,
+ description=(
+ "Data type for conversion: 'string', 'numeric', 'bool', or 'factor'. "
+ "When 'factor', the field must reference a DataCard field with a "
+ "class_label dtype specifying the allowed levels. VirtualDB will "
+ "register a DuckDB ENUM type and cast the column to it."
+ ),
)
@field_validator("path", "field", "expression", mode="before")
@@ -390,6 +396,8 @@ def validate_field_types(self) -> "PropertyMapping":
"""
Ensure at least one field type is specified and mutually exclusive.
+ Also validates that dtype='factor' requires 'field' (not expression or path-only).
+
:return: The validated PropertyMapping instance
:raises ValueError: If validation constraints are violated
@@ -404,6 +412,12 @@ def validate_field_types(self) -> "PropertyMapping":
raise ValueError(
"At least one of 'field', 'path', or 'expression' must be specified"
)
+ if self.dtype == "factor":
+ if self.expression is not None or self.field is None:
+ raise ValueError(
+ "dtype='factor' requires 'field' to be specified and "
+ "cannot be used with 'expression' or as a path-only mapping"
+ )
return self
diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py
index 6129880..3d32e52 100644
--- a/tfbpapi/tests/test_virtual_db.py
+++ b/tfbpapi/tests/test_virtual_db.py
@@ -15,7 +15,7 @@
import yaml # type: ignore
from tfbpapi.datacard import DatasetSchema
-from tfbpapi.models import DatasetType, MetadataConfig
+from tfbpapi.models import DatasetType, FeatureInfo, MetadataConfig
from tfbpapi.virtual_db import VirtualDB
# ------------------------------------------------------------------
@@ -1469,3 +1469,258 @@ def test_external_metadata_join(self, tmp_path, monkeypatch):
raw_result = v.query("SELECT * FROM chip ORDER BY sample_id")
assert "db_id" in raw_result.columns
assert len(raw_result) == 4 # 4 data rows
+
+
+# ------------------------------------------------------------------
+# Tests: dtype='factor' (DuckDB ENUM)
+# ------------------------------------------------------------------
+
+
+class TestFactorDtype:
+ """Tests for PropertyMapping dtype='factor' and DuckDB ENUM columns."""
+
+ def _make_vdb_with_factor(self, tmp_path, monkeypatch, feature_dtype):
+ """
+ Helper: build a VirtualDB with one dataset whose 'category' field
+ has a PropertyMapping with dtype='factor'. ``feature_dtype`` is
+ passed as the FeatureInfo.dtype for the 'category' field in the
+ mock DataCard.
+ """
+ import tfbpapi.virtual_db as vdb_module
+
+ df = pd.DataFrame(
+ {
+ "sample_id": [1, 1, 2, 2],
+ "category": ["A", "B", "A", "C"],
+ "value": [1.0, 2.0, 3.0, 4.0],
+ }
+ )
+ parquet_path = tmp_path / "data.parquet"
+ files = {("TestOrg/ds", "cfg"): [_write_parquet(parquet_path, df)]}
+
+ config = {
+ "repositories": {
+ "TestOrg/ds": {
+ "dataset": {
+ "cfg": {
+ "db_name": "ds",
+ "sample_id": {"field": "sample_id"},
+ "category": {
+ "field": "category",
+ "dtype": "factor",
+ },
+ }
+ }
+ }
+ }
+ }
+ config_file = tmp_path / "config.yaml"
+ with open(config_file, "w") as f:
+ yaml.dump(config, f)
+
+ card = MagicMock()
+ card.get_metadata_fields.return_value = ["sample_id", "category"]
+ card.get_field_definitions.return_value = {}
+ card.get_experimental_conditions.return_value = {}
+ card.get_metadata_config_name.return_value = None
+ card.get_dataset_schema.return_value = DatasetSchema(
+ data_columns={"sample_id", "category", "value"},
+ metadata_columns={"sample_id", "category"},
+ join_columns=set(),
+ metadata_source="embedded",
+ external_metadata_config=None,
+ is_partitioned=False,
+ )
+ feature_list = [
+ FeatureInfo(
+ name="category",
+ dtype=feature_dtype,
+ description="A categorical field",
+ ),
+ FeatureInfo(
+ name="sample_id",
+ dtype="int64",
+ description="Sample identifier",
+ ),
+ ]
+ card.get_features.return_value = feature_list
+
+ monkeypatch.setattr(
+ VirtualDB,
+ "_resolve_parquet_files",
+ lambda self, repo_id, cn: files.get((repo_id, cn), []),
+ )
+ monkeypatch.setattr(
+ vdb_module,
+ "_cached_datacard",
+ lambda repo_id, token=None: card,
+ )
+ return VirtualDB(config_file)
+
+ def test_factor_dtype_creates_enum_column(self, tmp_path, monkeypatch):
+ """Dtype='factor' casts the column to a DuckDB ENUM in the _meta view."""
+ v = self._make_vdb_with_factor(
+ tmp_path,
+ monkeypatch,
+ feature_dtype={"class_label": {"names": ["A", "B", "C"]}},
+ )
+ df = v.query("SELECT * FROM ds_meta ORDER BY sample_id")
+ assert "category" in df.columns
+ # Values should be preserved
+ assert set(df["category"].dropna()) == {"A", "B", "C"}
+
+ def test_factor_dtype_enum_type_registered(self, tmp_path, monkeypatch):
+ """The DuckDB ENUM type is registered and can be queried."""
+ v = self._make_vdb_with_factor(
+ tmp_path,
+ monkeypatch,
+ feature_dtype={"class_label": {"names": ["A", "B", "C"]}},
+ )
+ # Trigger view registration
+ v.tables()
+ # The ENUM type should be registered in DuckDB
+ types_df = v._conn.execute(
+ "SELECT type_name FROM duckdb_types() WHERE logical_type = 'ENUM'"
+ ).fetchdf()
+ assert "_enum_category" in types_df["type_name"].tolist()
+
+ def test_factor_dtype_missing_class_label_raises(self, tmp_path, monkeypatch):
+ """ValueError is raised when the DataCard field has no class_label dtype."""
+ with pytest.raises(ValueError, match="class_label"):
+ v = self._make_vdb_with_factor(
+ tmp_path,
+ monkeypatch,
+ feature_dtype="string", # not a class_label dict
+ )
+ v.tables() # triggers view registration
+
+ def test_factor_dtype_no_names_raises(self, tmp_path, monkeypatch):
+ """ValueError is raised when class_label has no 'names' key."""
+ with pytest.raises(ValueError, match="names"):
+ v = self._make_vdb_with_factor(
+ tmp_path,
+ monkeypatch,
+ feature_dtype={"class_label": {}}, # no names
+ )
+ v.tables()
+
+ def test_factor_dtype_model_validator_requires_field(self):
+ """PropertyMapping with dtype='factor' and no field raises ValidationError."""
+ from pydantic import ValidationError
+
+ with pytest.raises(ValidationError, match="factor"):
+ from tfbpapi.models import PropertyMapping
+
+ PropertyMapping.model_validate({"path": "some.path", "dtype": "factor"})
+
+ def test_factor_dtype_model_validator_rejects_expression(self):
+ """PropertyMapping with dtype='factor' and expression raises ValidationError."""
+ from pydantic import ValidationError
+
+ with pytest.raises(ValidationError):
+ from tfbpapi.models import PropertyMapping
+
+ PropertyMapping.model_validate({"expression": "col > 0", "dtype": "factor"})
+
+ def test_factor_dtype_inplace_renames_raw_to_orig(self, tmp_path, monkeypatch):
+ """
+ When dtype='factor' maps a field to the same output name (e.g.
+ category: {field: category, dtype: factor}), the raw column is
+ renamed to ``<name>_orig`` in the _meta view, and the
+ ENUM-cast column
+ keeps the original name.
+ """
+ v = self._make_vdb_with_factor(
+ tmp_path,
+ monkeypatch,
+ feature_dtype={"class_label": {"names": ["A", "B", "C"]}},
+ )
+ df = v.query("SELECT * FROM ds_meta ORDER BY sample_id")
+ # ENUM-cast column keeps the original name
+ assert "category" in df.columns
+ # Raw numeric/string original is preserved under _orig alias
+ assert "category_orig" in df.columns
+ # The _orig column should hold the raw values
+ assert set(df["category_orig"].dropna()) == {"A", "B", "C"}
+
+ def test_factor_dtype_orig_suffix_avoids_collision(self, tmp_path, monkeypatch):
+ """When ``<col>_orig`` already exists in the parquet, the rename uses
+ ``<col>_orig_1`` instead."""
+ import tfbpapi.virtual_db as vdb_module
+
+ df = pd.DataFrame(
+ {
+ "sample_id": [1, 2],
+ "category": ["A", "B"],
+ "category_orig": ["x", "y"], # pre-existing _orig column
+ "value": [1.0, 2.0],
+ }
+ )
+ parquet_path = tmp_path / "data2.parquet"
+ files = {("TestOrg/ds2", "cfg2"): [_write_parquet(parquet_path, df)]}
+
+ config = {
+ "repositories": {
+ "TestOrg/ds2": {
+ "dataset": {
+ "cfg2": {
+ "db_name": "ds2",
+ "sample_id": {"field": "sample_id"},
+ "category": {
+ "field": "category",
+ "dtype": "factor",
+ },
+ }
+ }
+ }
+ }
+ }
+ config_file = tmp_path / "config2.yaml"
+ with open(config_file, "w") as f:
+ yaml.dump(config, f)
+
+ card = MagicMock()
+ card.get_metadata_fields.return_value = [
+ "sample_id",
+ "category",
+ "category_orig",
+ ]
+ card.get_field_definitions.return_value = {}
+ card.get_experimental_conditions.return_value = {}
+ card.get_metadata_config_name.return_value = None
+ card.get_dataset_schema.return_value = DatasetSchema(
+ data_columns={"sample_id", "category", "category_orig", "value"},
+ metadata_columns={"sample_id", "category", "category_orig"},
+ join_columns=set(),
+ metadata_source="embedded",
+ external_metadata_config=None,
+ is_partitioned=False,
+ )
+ card.get_features.return_value = [
+ FeatureInfo(
+ name="category",
+ dtype={"class_label": {"names": ["A", "B"]}},
+ description="categorical",
+ ),
+ FeatureInfo(
+ name="sample_id",
+ dtype="int64",
+ description="id",
+ ),
+ ]
+
+ monkeypatch.setattr(
+ VirtualDB,
+ "_resolve_parquet_files",
+ lambda self, repo_id, cn: files.get((repo_id, cn), []),
+ )
+ monkeypatch.setattr(
+ vdb_module,
+ "_cached_datacard",
+ lambda repo_id, token=None: card,
+ )
+ v = VirtualDB(config_file)
+
+ result = v.query("SELECT * FROM ds2_meta ORDER BY sample_id")
+ # Should use _orig_1 because _orig is taken
+ assert "category_orig_1" in result.columns
+ assert "category" in result.columns
diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py
index eaa77d3..5ac1178 100644
--- a/tfbpapi/virtual_db.py
+++ b/tfbpapi/virtual_db.py
@@ -815,7 +816,49 @@ def qualify(col: str) -> str:
return f"m.{col}"
return f"d.{col}"
+ # Resolve derived property expressions first.
+ # When a factor mapping has the same output name as its source
+ # field (e.g. time -> time), the raw column must be renamed to
+ # avoid a duplicate column name in the SELECT. The rename uses
+ # "<col>_orig", or "<col>_orig_1", etc., to avoid collisions with
+ # other columns that already exist in the parquet.
+ prop_result = self._resolve_property_columns(repo_id, config_name)
+
+ # Collect all column names that exist in the parquet so we can
+ # find a unique _orig suffix when needed.
+ all_parquet_cols: set[str] = set(self._get_view_columns(parquet_view))
+
+ # Map: raw_col -> alias_in_select for factor-overridden cols
+ factor_col_renames: dict[str, str] = {}
+ if prop_result is not None:
+ _derived_exprs, _prop_raw_cols = prop_result
+ for expr in _derived_exprs:
+ # Detect "CAST(<col> AS _enum_<name>) AS <out>" patterns
+ # where <col> == <out> (in-place factor override)
+ if not expr.startswith("CAST("):
+ continue
+ parts = expr.rsplit(" AS ", 1)
+ if len(parts) != 2:
+ continue
+ out_col = parts[1].strip()
+ # Check whether the source field has the same name as
+ # the output column (in-place override case)
+ cast_inner = parts[0][len("CAST(") :]
+ src_field = cast_inner.split(" AS ")[0].strip()
+ if src_field == out_col and out_col in all_parquet_cols:
+ # Find a unique _orig name
+ candidate = f"{out_col}_orig"
+ n = 1
+ while candidate in all_parquet_cols or candidate in (
+ v for v in factor_col_renames.values()
+ ):
+ candidate = f"{out_col}_orig_{n}"
+ n += 1
+ factor_col_renames[src_field] = candidate
+
# Build SELECT: sample_id + metadata cols (deduplicated).
+ # Raw columns that are factor-overridden are emitted with their
+ # _orig alias instead of their original name.
# If the configured sample_id column differs from "sample_id",
# rename it so all views expose a consistent "sample_id" column.
# If the parquet also has a literal "sample_id" column, preserve
@@ -825,14 +868,18 @@ def qualify(col: str) -> str:
rename_sample = sample_col != "sample_id"
def add_col(col: str) -> None:
- if col not in seen:
- seen.add(col)
- if rename_sample and col == sample_col:
- select_parts.append(f"{qualify(col)} AS sample_id")
- elif rename_sample and col == "sample_id":
- select_parts.append(f"{qualify(col)} AS sample_id_orig")
- else:
- select_parts.append(qualify(col))
+ if col in seen:
+ return
+ seen.add(col)
+ alias = factor_col_renames.get(col)
+ if alias:
+ select_parts.append(f"{qualify(col)} AS {alias}")
+ elif rename_sample and col == sample_col:
+ select_parts.append(f"{qualify(col)} AS sample_id")
+ elif rename_sample and col == "sample_id":
+ select_parts.append(f"{qualify(col)} AS sample_id_orig")
+ else:
+ select_parts.append(qualify(col))
add_col(sample_col)
# When renaming, check if the parquet source also has a literal
@@ -845,12 +892,21 @@ def add_col(col: str) -> None:
add_col(col)
# Add derived property expressions from the VirtualDB config
- prop_result = self._resolve_property_columns(repo_id, config_name)
if prop_result is not None:
derived_exprs, prop_raw_cols = prop_result
# Ensure source columns needed by expressions are selected
for col in prop_raw_cols:
add_col(col)
+ # Rewrite CAST expressions to use the _orig alias when the
+ # source field was renamed to avoid collision.
+ if factor_col_renames:
+ rewritten = []
+ for expr in derived_exprs:
+ for orig, alias in factor_col_renames.items():
+ # Replace "CAST(<orig> AS" with "CAST(<alias> AS"
+ expr = expr.replace(f"CAST({orig} AS", f"CAST({alias} AS")
+ rewritten.append(expr)
+ derived_exprs = rewritten
# Qualify source column references inside CASE WHEN expressions
if is_join:
qualified_exprs = []
@@ -1008,6 +1082,76 @@ def _resolve_metadata_fields(
)
return None
+ def _get_class_label_names(
+ self, card: Any, config_name: str, field: str
+ ) -> list[str]:
+ """
+ Return the ENUM levels for a field with class_label dtype.
+
+ Looks up the FeatureInfo for ``field`` in the DataCard config and
+ extracts the ``names`` list from its ``class_label`` dtype dict.
+
+ :param card: DataCard instance
+ :param config_name: Configuration name
+ :param field: Field name to look up
+ :return: List of level strings
+ :raises ValueError: If the field is not found, has no class_label dtype,
+ or the class_label dict has no ``names`` key
+
+ """
+ try:
+ features = card.get_features(config_name)
+ except Exception as exc:
+ raise ValueError(
+ f"Could not retrieve features for config '{config_name}': {exc}"
+ ) from exc
+
+ feature = next((f for f in features if f.name == field), None)
+ if feature is None:
+ raise ValueError(
+ f"Field '{field}' not found in DataCard config '{config_name}'. "
+ "dtype='factor' requires the field to be declared in the DataCard."
+ )
+
+ dtype = feature.dtype
+ if not isinstance(dtype, dict) or "class_label" not in dtype:
+ raise ValueError(
+ f"dtype='factor' is set for field '{field}' in config "
+ f"'{config_name}', but the DataCard dtype is {dtype!r} rather "
+ "than a class_label dict. "
+ "The DataCard must declare dtype: {class_label: {names: [...]}}."
+ )
+
+ class_label = dtype["class_label"]
+ names = class_label.get("names") if isinstance(class_label, dict) else None
+ if not names:
+ raise ValueError(
+ f"class_label for field '{field}' in config '{config_name}' "
+ "has no 'names' key or the names list is empty. "
+ "Specify levels as: class_label: {names: [level1, level2, ...]}."
+ )
+
+ return [str(n) for n in names]
+
+ def _ensure_enum_type(self, type_name: str, levels: list[str]) -> None:
+ """
+ Create or replace a DuckDB ENUM type with the given levels.
+
+ DuckDB ENUM types must be registered before use in CAST expressions. Drops any
+ existing type with the same name first to allow re-registration on repeated view
+ builds.
+
+ :param type_name: SQL identifier for the ENUM type
+ :param levels: Ordered list of allowed string values
+
+ """
+ try:
+ self._conn.execute(f"DROP TYPE IF EXISTS {type_name}")
+ except Exception:
+ pass # type may not exist yet
+ escaped = ", ".join(f"'{v.replace(chr(39), chr(39)*2)}'" for v in levels)
+ self._conn.execute(f"CREATE TYPE {type_name} AS ENUM ({escaped})")
+
def _resolve_alias(self, col: str, value: str) -> str:
"""
Apply factor alias to a value if one is configured.
@@ -1075,9 +1219,19 @@ def _resolve_property_columns(
continue
if mapping.field is not None and mapping.path is None:
- # Type A: field-only (alias or no-op)
+ # Type A: field-only (alias or ENUM cast)
raw_cols.add(mapping.field)
- if key == mapping.field:
+ if mapping.dtype == "factor":
+ # Fetch class_label levels from DataCard, register ENUM,
+ # and emit a CAST expression. Raises ValueError if the
+ # DataCard does not declare a class_label dtype.
+ enum_type = f"_enum_{key}"
+ levels = self._get_class_label_names(
+ card, config_name, mapping.field
+ )
+ self._ensure_enum_type(enum_type, levels)
+ expressions.append(f"CAST({mapping.field} AS {enum_type}) AS {key}")
+ elif key == mapping.field:
# no-op -- column already present as raw col
pass
else:
@@ -1085,7 +1239,15 @@ def _resolve_property_columns(
continue
if mapping.field is not None and mapping.path is not None:
- # Type B: field + path -- resolve from definitions
+ # Type B: field + path -- resolve from definitions.
+ # dtype='factor' is not supported here: levels come from a
+ # class_label field, not a definitions path.
+ if mapping.dtype == "factor":
+ raise ValueError(
+ f"dtype='factor' is not supported for field+path mappings "
+ f"(key='{key}'). Use dtype='factor' only with field-only "
+ "mappings that reference a class_label field in the DataCard."
+ )
raw_cols.add(mapping.field)
expr = self._build_field_path_expr(
key,