diff --git a/docs/tutorials/virtual_db_tutorial.ipynb b/docs/tutorials/virtual_db_tutorial.ipynb index ffc73b9..b371889 100644 --- a/docs/tutorials/virtual_db_tutorial.ipynb +++ b/docs/tutorials/virtual_db_tutorial.ipynb @@ -298,11 +298,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 11305.40it/s]\n", - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6442.86it/s]\n", - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 9868.95it/s]\n", + "Fetching 1 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 11305.40it/s]\n", + "Fetching 1 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 6442.86it/s]\n", + "Fetching 1 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 9868.95it/s]\n", "No metadata fields found for data config 'dto' in repo 'BrentLab/yeast_comparative_analysis' -- no embedded metadata_fields and no metadata config with applies_to\n", - "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 7124.69it/s]\n", + "Fetching 30 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 30/30 [00:00<00:00, 7124.69it/s]\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", @@ -919,6 +919,20 @@ "print(\"Common fields:\", vdb.get_common_fields())" ] }, + { + "cell_type": "markdown", + "id": "missing-value-labels-md", + "metadata": {}, + "source": "## Missing Value Labels\n\nWhen a property key is listed under `missing_value_labels`, every dataset\nthat does **not** have an explicit mapping for that property will still expose\nthe column in its `_meta` view, filled with the configured fallback string.\n\nIn the config above, `carbon_source: 
unspecified` is set in `missing_value_labels`.\nAll three datasets (harbison, kemmeren, hackett) happen to have an explicit\n`carbon_source` mapping, so they resolve real values.\n\nTo demonstrate the fallback, we build a minimal config that omits `carbon_source`\nfrom kemmeren. Without `missing_value_labels`, kemmeren would have no\n`carbon_source` column at all. With it, the column appears with the default value." + }, + { + "cell_type": "code", + "execution_count": null, + "id": "missing-value-labels-code", + "metadata": {}, + "outputs": [], + "source": "minimal_yaml = \"\"\"\nrepositories:\n BrentLab/harbison_2004:\n dataset:\n harbison_2004:\n db_name: harbison2\n sample_id:\n field: sample_id\n # harbison has carbon_source mapped via field+path\n carbon_source:\n field: condition\n path: media.carbon_source.compound\n\n BrentLab/kemmeren_2014:\n dataset:\n kemmeren_2014:\n db_name: kemmeren2\n sample_id:\n field: sample_id\n # kemmeren has NO carbon_source mapping -- fallback will apply\n\nfactor_aliases:\n carbon_source:\n glucose: [D-glucose, dextrose, glu]\n\nmissing_value_labels:\n carbon_source: unspecified\n\"\"\"\n\nimport tempfile\nfrom pathlib import Path\nfrom tfbpapi.virtual_db import VirtualDB\n\np = Path(tempfile.mkdtemp()) / \"minimal.yaml\"\np.write_text(minimal_yaml)\nvdb2 = VirtualDB(str(p))\n\n# harbison resolves real values from DataCard definitions\nprint(\"harbison2 carbon_source values:\")\nprint(vdb2.query(\"SELECT DISTINCT carbon_source FROM harbison2_meta\"))\n\n# kemmeren has no mapping -- gets the missing_value_labels fallback\nprint(\"\\nkemmeren2 carbon_source values:\")\nprint(vdb2.query(\"SELECT DISTINCT carbon_source FROM kemmeren2_meta\"))\n\n# Both views expose the column, enabling cross-dataset queries without COALESCE\nprint(\"\\ncross-dataset query using carbon_source on both:\")\nprint(vdb2.query(\"\"\"\n SELECT 'harbison' AS dataset, carbon_source, COUNT(*) AS n\n FROM harbison2_meta GROUP BY carbon_source\n UNION ALL\n 
SELECT 'kemmeren' AS dataset, carbon_source, COUNT(*) AS n\n FROM kemmeren2_meta GROUP BY carbon_source\n ORDER BY dataset, carbon_source\n\"\"\"))\n\np.unlink(missing_ok=True)\n" + }, { "cell_type": "markdown", "id": "cell-10", @@ -1821,7 +1835,7 @@ " \n", " \n", "\n", - "
63 rows × 2 columns
\n", + "63 rows \u00d7 2 columns
\n", "" ], "text/plain": [ @@ -4857,7 +4871,7 @@ " \n", " \n", "\n", - "29804 rows × 15 columns
\n", + "29804 rows \u00d7 15 columns
\n", "" ], "text/plain": [ diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md index 90e58d0..fd1e5f4 100644 --- a/docs/virtual_db_configuration.md +++ b/docs/virtual_db_configuration.md @@ -267,6 +267,33 @@ data via `(repo_id, config_name)` pairs for programmatic or developer use: vdb.config.get_tags("BrentLab/harbison_2004", "harbison_2004") ``` +## Missing Value Labels + +`missing_value_labels` is a top-level mapping from property name to a default +string value. When a property is listed here, every dataset's `_meta` view will +include that column -- even datasets that have no explicit mapping for it. For +those datasets, the column is emitted as the constant fallback value. + +Datasets that *do* have an explicit mapping for the property are unaffected; they +resolve the value normally (from field definitions, a path, or an expression). + +```yaml +missing_value_labels: + carbon_source: "unspecified" + temperature_celsius: "unspecified" +``` + +**Behavior by dataset**: + +| Dataset | `carbon_source` mapping | `carbon_source` in `_meta` | +|---------|------------------------|---------------------------| +| harbison | `field: condition, path: media.carbon_source.compound` | resolved from DataCard definitions | +| degron | (none) | `'unspecified'` (fallback) | + +Without `missing_value_labels`, datasets that lack the mapping simply do not +include the column in their `_meta` view, making cross-dataset queries on that +column error or require `COALESCE`. 
+ ## Comparative Datasets Comparative datasets differ from other dataset types in that they represent diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index 2f5ca09..ed8453f 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -1210,25 +1210,30 @@ def _resolve_property_columns( """ mappings = self.config.get_property_mappings(repo_id, config_name) - if not mappings: + if not mappings and not self.config.missing_value_labels: return None expressions: list[str] = [] raw_cols: set[str] = set() - try: - card = self._datacards.get(repo_id) or _cached_datacard( - repo_id, token=self.token - ) - except Exception as exc: - logger.warning( - "Could not load DataCard for %s: %s", - repo_id, - exc, - ) - return None + card = None + if mappings: + try: + card = self._datacards.get(repo_id) or _cached_datacard( + repo_id, token=self.token + ) + except Exception as exc: + logger.warning( + "Could not load DataCard for %s: %s", + repo_id, + exc, + ) for key, mapping in mappings.items(): + if card is None: + # No DataCard could be loaded, so this mapping is skipped. The key is + # still in `mappings`, so the missing_value_labels loop will NOT backfill it. + continue if mapping.expression is not None: # Type D: expression expressions.append(f"({mapping.expression}) AS {key}") @@ -1300,6 +1305,14 @@ def _resolve_property_columns( expressions.append(expr) continue + # For any key in missing_value_labels that was not covered by an + # explicit mapping for this dataset, emit a constant literal so that + # every _meta view exposes the column (with the fallback value). + for key, label in self.config.missing_value_labels.items(): + if key not in mappings: + escaped = label.replace("'", "''") + expressions.append(f"'{escaped}' AS {key}") + if not expressions and not raw_cols: return None