diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py
index 5ac1178..2f5ca09 100644
--- a/tfbpapi/virtual_db.py
+++ b/tfbpapi/virtual_db.py
@@ -43,6 +43,7 @@
 from __future__ import annotations
 
 import logging
+import re
 from functools import lru_cache
 from pathlib import Path
 from typing import Any
@@ -580,7 +581,6 @@ def _register_all_views(self) -> None:
             if not self._is_comparative(repo_id, config_name):
                 self._enrich_raw_view(db_name)
 
-        # 5. Comparative expanded views (pre-parsed composite IDs)
         # 5. Comparative expanded views (pre-parsed composite IDs)
         for db_name, (repo_id, config_name) in self._db_name_map.items():
             repo_cfg = self.config.repositories.get(repo_id)
@@ -833,18 +833,26 @@ def qualify(col: str) -> str:
         if prop_result is not None:
             _derived_exprs, _prop_raw_cols = prop_result
             for expr in _derived_exprs:
-                # Detect CAST(<field> AS _enum_<...>) AS <key> patterns
-                # where <key> == <field> (in-place factor override)
-                if not expr.startswith("CAST("):
-                    continue
+                # Detect factor CAST expressions of the form:
+                #   CAST(CAST(<field> AS VARCHAR) AS _enum_<...>) AS <key>
+                # where <key> == <field> (in-place factor override).
+                # The output column name is the last " AS " token.
                 parts = expr.rsplit(" AS ", 1)
                 if len(parts) != 2:
                     continue
                 out_col = parts[1].strip()
-                # Check whether the source field has the same name as
-                # the output column (in-place override case)
-                cast_inner = parts[0][len("CAST(") :]
-                src_field = cast_inner.split(" AS ")[0].strip()
+                # Extract the innermost source field from the CAST chain.
+                # Handles both:
+                #   CAST(CAST(<field> AS VARCHAR) AS _enum_<...>)
+                #   CAST(CAST(CAST(<field> AS BIGINT) AS VARCHAR) AS _enum_<...>)
+                m = re.match(
+                    r"CAST\(CAST\((?:CAST\()?(\w+)(?:\s+AS\s+BIGINT\))?"
+                    r"\s+AS\s+VARCHAR\)\s+AS\s+_enum_\w+\)",
+                    parts[0],
+                )
+                if m is None:
+                    continue
+                src_field = m.group(1)
                 if src_field == out_col and out_col in all_parquet_cols:
                     # Find a unique _orig name
                     candidate = f"{out_col}_orig"
@@ -894,8 +902,16 @@ def add_col(col: str) -> None:
         # Add derived property expressions from the VirtualDB config
         if prop_result is not None:
             derived_exprs, prop_raw_cols = prop_result
-            # Ensure source columns needed by expressions are selected
-            for col in prop_raw_cols:
+            # Ensure source columns needed by expressions are selected.
+            # For external metadata datasets, restrict to columns physically
+            # present in the metadata parquet -- data columns must not bleed
+            # into the meta view.
+            allowed_raw_cols = (
+                [c for c in prop_raw_cols if c in actual_meta_cols]
+                if is_external
+                else prop_raw_cols
+            )
+            for col in allowed_raw_cols:
                 add_col(col)
             # Rewrite CAST expressions to use the _orig alias when the
             # source field was renamed to avoid collision.
@@ -1230,7 +1246,17 @@ def _resolve_property_columns(
                     card, config_name, mapping.field
                 )
                 self._ensure_enum_type(enum_type, levels)
-                expressions.append(f"CAST({mapping.field} AS {enum_type}) AS {key}")
+                # If all levels are integer-valued strings (e.g. '0',
+                # '90'), the parquet column may be DOUBLE (e.g. 90.0).
+                # Cast through BIGINT first to strip the decimal before
+                # converting to VARCHAR so '90.0' becomes '90'.
+                all_int = all(re.fullmatch(r"-?\d+", lv) for lv in levels)
+                inner = (
+                    f"CAST({mapping.field} AS BIGINT)" if all_int else mapping.field
+                )
+                expressions.append(
+                    f"CAST(CAST({inner} AS VARCHAR) AS {enum_type}) AS {key}"
+                )
             elif key == mapping.field:
                 # no-op -- column already present as raw col
                 pass