diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c340f25..bdb4d39 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,14 +32,7 @@ jobs: strategy: matrix: # see https://github.com/actions/runner-images - os: - [ - ubuntu-22.04, - ubuntu-24.04, - macos-13, - windows-2022, - windows-2019, - ] + os: [ubuntu-22.04, ubuntu-24.04, macos-15, windows-2022] python-version: ["3.11"] runs-on: ${{ matrix.os }} steps: diff --git a/.gitignore b/.gitignore index 3eeb70f..cf9aeb9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,14 @@ -#mac files +# claude +CLAUDE.md +.claude/ +claude_output/ +claude_logs/ + +# mac files **/.DS_Store # Dataset directory -data/ +./data/ # logs **/logs/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0961d95..e45b42f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: ^docs/|devcontainer.json|.*/snapshots/ +exclude: ^docs/|devcontainer.json|.*/snapshots/|mkdocs.yml default_stages: [commit] default_language_version: @@ -62,6 +62,7 @@ repos: rev: v1.15.0 hooks: - id: mypy + additional_dependencies: [types-requests] - repo: https://github.com/markdownlint/markdownlint rev: v0.12.0 diff --git a/docs/brentlab_yeastresources_collection.md b/docs/brentlab_yeastresources_collection.md new file mode 100644 index 0000000..4f49a02 --- /dev/null +++ b/docs/brentlab_yeastresources_collection.md @@ -0,0 +1,378 @@ +# BrentLab Yeast Resources Collection + +This document describes the BrentLab yeast resources collection on HuggingFace as an example implementation of the [datacard specifications](huggingface_datacard.md). This collection demonstrates best practices for organizing transcription factor binding and perturbation datasets for *Saccharomyces cerevisiae*. + +## Collection Overview + +The BrentLab yeast resources collection contains 11 datasets related to yeast transcription factor binding and gene expression regulation: + +1. 
**barkai_compendium** - ChEC-seq binding data across multiple GEO series +2. **callingcards** - Calling Cards transposon-based binding data +3. **hackett_2020** - TF overexpression with nutrient limitation +4. **harbison_2004** - ChIP-chip binding across 14 environmental conditions +5. **hu_2007_reimand_2010** - TF knockout expression data +6. **hughes_2006** - TF perturbation screen (overexpression and knockout) +7. **kemmeren_2014** - TF deletion expression profiling +8. **mahendrawada_2025** - ChEC-seq and nascent RNA-seq data +9. **rossi_2021** - ChIP-exo binding data +10. **yeast_comparative_analysis** - Cross-dataset comparative analyses +11. **yeast_genome_resources** - Reference genomic features + +## Standardized Media Names + +The collection uses standardized media names to facilitate cross-dataset queries. When specifying media in datacards, use these canonical names: + +### Rich Media + +- **YPD** (Yeast extract Peptone Dextrose) + - Carbon source: 2% D-glucose + - Nitrogen sources: 1% yeast extract, 2% peptone + - Standard rich medium for yeast growth + +- **yeast_extract_peptone** + - Base medium without specified carbon source + - Used with galactose (YPGal) or raffinose (YPRaff) + +### Minimal/Defined Media + +- **minimal** or **minimal_glucose** + - Minimal defined medium with glucose as carbon source + - Nitrogen source varies by experiment + +- **synthetic_complete** or **synthetic_complete_dextrose** + - Defined medium with complete amino acid supplementation + - Carbon source: typically 2% D-glucose + - Nitrogen source: yeast nitrogen base + amino acid dropout mix + +- **synthetic_complete_minus_X** + - Synthetic complete medium lacking specific nutrient(s) + - Examples: `synthetic_complete_minus_thiamine`, `synthetic_complete_minus_phosphate` + - Used for nutrient deprivation experiments + +- **selective_medium** + - Defined medium for plasmid selection + - Specific composition varies by selection markers + +## Standardized Strain Backgrounds 
+ +The collection primarily uses these strain backgrounds: + +- **BY4741** - MATa his3Δ1 leu2Δ0 met15Δ0 ura3Δ0 + - Used in: hu_2007_reimand_2010, kemmeren_2014 + +- **W303** - Common alternative strain background + - Used in: harbison_2004 (derivative Z1256) + +- **S288C** - Reference genome strain + - Used in: Various datasets + +Strain background can be specified as a string or detailed object: + +```yaml +# Simple string +experimental_conditions: + strain_background: BY4741 + +# Detailed specification +experimental_conditions: + strain_background: + genotype: BY4741 + mating_type: MATa + markers: + - his3Δ1 + - leu2Δ0 + - met15Δ0 + - ura3Δ0 + source: Open_Biosystems + description: Knockout strains for nonessential transcription factors +``` + +## Standard Experimental Conditions + +### Growth Temperature + +Standard growth temperature across the collection is **30°C** unless otherwise noted. + +Exceptions: +- **rossi_2021**: 25°C baseline with 37°C heat shock for some samples +- **hu_2007_reimand_2010**: Heat shock at 39°C for heat shock response TFs +- **callingcards**: the experiments are performed at room temperature (~22-25°C) + +### Growth Phase + +Common growth phase specifications: + +These labels are taken from the original publications. In some cases the OD600 +is noted + +- **early_log_phase** +- **mid_log_phase** +- **late_log_phase** +- **stationary_phase** - eg barkai_compendium, which are allowed to grow overnight. The + cells are harvested at a very high density (OD600 4.0). 
+ +Example: +```yaml +experimental_conditions: + growth_phase_at_harvest: + stage: mid_log_phase + od600: 0.6 + od600_tolerance: 0.1 +``` + +### Cultivation Methods + +Standard cultivation methods used: + +- **liquid_culture** - Standard batch culture in flasks +- **batch** - Batch culture +- **plate** - Growth on agar plates +- **chemostat** - Continuous culture (hackett_2020) + +## Concentration Specifications + +**Always use `concentration_percent`** for all concentration specifications. +Convert other units to percentage: + +- **mg/ml to percent**: divide by 10 (e.g., 5 mg/ml = 0.5%) +- **g/L to percent**: divide by 10 (e.g., 6.71 g/L = 0.671%) +- **Molar to percent**: convert using molecular weight + - Example: 100 nM rapamycin = 9.142e-6% + +### Examples from the Collection + +```yaml +# Yeast nitrogen base: 6.71 g/L = 0.671% +- compound: yeast_nitrogen_base + concentration_percent: 0.671 + +# Alpha factor: 5 mg/ml = 0.5% +- compound: alpha_factor_pheromone + concentration_percent: 0.5 + +# Rapamycin: 100 nM = 9.142e-6% +chemical_treatment: + compound: rapamycin + concentration_percent: 9.142e-6 +``` + +## Field Naming Conventions + +The collection follows these field naming conventions: + +### Gene/Feature Identifiers + +- **regulator_locus_tag**: Systematic ID of regulatory factor (e.g., "YJR060W") +- **regulator_symbol**: Common name of regulatory factor (e.g., "CBF1") +- **target_locus_tag**: Systematic ID of target gene +- **target_symbol**: Common name of target gene + +All locus tags and symbols join to **yeast_genome_resources** dataset. 
+ +### Quantitative Measurements Examples + +Common measurement field names: + +- **effect**, **log2fc**, **log2_ratio** - Log fold change measurements +- **pvalue**, **pval**, **p_value** - Statistical significance +- **padj**, **adj_p_value** - FDR-adjusted p-values +- **binding_score**, **peak_score** - Binding strength metrics +- **enrichment** - Enrichment ratios + +### Experimental Metadata Examples + +- **sample_id** - Unique sample identifier (integer) +- **db_id** - Legacy database identifier (deprecated, do not use) +- **batch** - Experimental batch identifier +- **replicate** - Biological replicate number +- **time** - Timepoint in timecourse experiments + +## Dataset Type Usage Examples + +### genomic_features + +**yeast_genome_resources** provides reference annotations: +- Gene coordinates and strand information +- Systematic IDs (locus_tag) and common names (symbol) +- Feature types (gene, ncRNA_gene, tRNA_gene, etc.) + +Used for joining regulator/target identifiers across all other datasets. + +### annotated_features + +Most common dataset type in the collection. Examples: + +- **hackett_2020**: TF overexpression with timecourse measurements +- **harbison_2004**: ChIP-chip binding with condition field definitions +- **kemmeren_2014**: TF deletion expression data +- **mahendrawada_2025**: ChEC-seq binding scores + +Typical structure: regulator × target × measurements, with optional condition fields. 
+ +### genome_map + +Position-level data, typically partitioned by sample or accession: + +- **barkai_compendium**: ChEC-seq pileup data partitioned by Series/Accession +- **rossi_2021**: ChIP-exo 5' tag coverage partitioned by sample +- **callingcards**: Transposon insertion density partitioned by batch + +### metadata + +Separate metadata configs or embedded metadata via `metadata_fields`: + +**Separate config example** (barkai_compendium): +```yaml +- config_name: GSE178430_metadata + dataset_type: metadata + applies_to: ["genomic_coverage"] +``` + +**Embedded metadata example** (harbison_2004): +```yaml +- config_name: harbison_2004 + dataset_type: annotated_features + metadata_fields: ["regulator_locus_tag", "regulator_symbol", "condition"] +``` + +### comparative + +**yeast_comparative_analysis** provides cross-dataset analysis results: + +- **dto config**: Direct Target Overlap analysis comparing binding and perturbation experiments +- Uses `source_sample` role for composite identifiers +- Format: `"repo_id;config_name;sample_id"` (semicolon-separated) +- Contains 8 quantitative measures: rank thresholds, set sizes, FDR, p-values +- Partitioned by binding_repo_dataset and perturbation_repo_dataset + +**Composite Sample Identifiers**: +Comparative datasets use composite identifiers to reference samples from other datasets: +- `binding_id`: Points to a binding experiment (e.g., `BrentLab/callingcards;annotated_features;1`) +- `perturbation_id`: Points to a perturbation experiment (e.g., `BrentLab/hackett_2020;hackett_2020;200`) + +**Typical structure**: source_sample_1 x source_sample_2 x ... x measurements + +**Use case**: Answer questions like "Which binding experiments show significant overlap with perturbation effects?" + +## Categorical Condition Definitions + +Many datasets define categorical experimental conditions using the `definitions` field. 

### harbison_2004 Environmental Conditions

14 conditions with detailed specifications:
- **YPD** (rich media baseline)
- **SM** (amino acid starvation)
- **RAPA** (rapamycin treatment)
- **H2O2Hi**, **H2O2Lo** (oxidative stress)
- **HEAT** (heat shock)
- **GAL**, **RAFF** (alternative carbon sources)
- And 6 more...

Each condition definition includes media composition, temperature, growth phase, and treatments.

### hackett_2020 Nutrient Limitations

```yaml
restriction:
  definitions:
    P:  # Phosphate limitation
      media:
        phosphate_source:
          - compound: potassium_phosphate_monobasic
            concentration_percent: 0.002
    N:  # Nitrogen limitation
      media:
        nitrogen_source:
          - compound: ammonium_sulfate
            concentration_percent: 0.004
    M:  # Undefined limitation
      description: "Not defined in the paper"
```

### hu_2007_reimand_2010 Treatment Conditions

```yaml
heat_shock:
  definitions:
    true:
      temperature_celsius: 39
      duration_minutes: 15
    false:
      description: Standard growth conditions at 30°C
```

## Partitioning Strategies

Large genome_map datasets use partitioning:

**barkai_compendium** - Two-level partitioning:
```yaml
partitioning:
  partition_by: ["Series", "Accession"]
  path_template: "genome_map/*/*/part-0.parquet"
```

**callingcards** - Batch partitioning:
```yaml
partitioning:
  enabled: true
  partition_by: ["batch"]
  path_template: "genome_map/batch={batch}/*.parquet"
```

## Collection-Wide Best Practices

### 1. Omit unspecified fields with a comment

`tfbpapi` will handle adding "unspecified" to fields which are not common across
datasets.

```yaml
# CORRECT
experimental_conditions:
  temperature_celsius: 30
  # cultivation_method is not noted in the paper and is omitted

# INCORRECT
experimental_conditions:
  temperature_celsius: unspecified
```

### 2.
Document Source Publications + +If the original paper used something like g/L, then convert that to +`concentration_percent` and add a comment with the original value and units. + +```yaml +carbon_source: + - compound: D-glucose + # Saldanha et al 2004: 10 g/L + concentration_percent: 1 +``` + +### 3. Use Standard Field Roles + +Apply semantic roles consistently: +- `regulator_identifier` - for regulator fields +- `target_identifier` - for target fields +- `quantitative_measure` - for measurements +- `experimental_condition` - for condition fields +- `genomic_coordinate` - for positional data + +### 4. Provide sample_id + +All annotated_features datasets should include `sample_id` to uniquely identify experimental samples. This enables cross-dataset joining and metadata management. + +### 5. Specify metadata_fields or applies_to + +For datasets with metadata, either: +- Use `metadata_fields` to extract from the data itself, OR +- Create separate metadata config with `applies_to` field + +### 6. Use Consistent Gene Identifiers + +All regulator/target identifiers must be joinable to **yeast_genome_resources**: +- Use current systematic IDs (ORF names) +- Include both locus_tag and symbol fields +- Mark with appropriate roles diff --git a/docs/datacard.md b/docs/datacard.md new file mode 100644 index 0000000..cfab1f1 --- /dev/null +++ b/docs/datacard.md @@ -0,0 +1,6 @@ +# DataCard + +::: tfbpapi.datacard.DataCard + options: + show_root_heading: true + show_source: true diff --git a/docs/errors.md b/docs/errors.md new file mode 100644 index 0000000..6ba92ff --- /dev/null +++ b/docs/errors.md @@ -0,0 +1,28 @@ +# Custom Exceptions + +## HfDataFetchError + +::: tfbpapi.errors.HfDataFetchError + options: + show_root_heading: true + show_source: true + +Raised when HuggingFace API requests fail during data fetching operations. 
+ +## DataCardError + +::: tfbpapi.errors.DataCardError + options: + show_root_heading: true + show_source: true + +Base exception for DataCard operations. + +## DataCardValidationError + +::: tfbpapi.errors.DataCardValidationError + options: + show_root_heading: true + show_source: true + +Raised when dataset card validation fails during parsing or loading. \ No newline at end of file diff --git a/docs/fetchers.md b/docs/fetchers.md new file mode 100644 index 0000000..2901a79 --- /dev/null +++ b/docs/fetchers.md @@ -0,0 +1,16 @@ +# Data Fetchers + +::: tfbpapi.fetchers.HfDataCardFetcher + options: + show_root_heading: true + show_source: true + +::: tfbpapi.fetchers.HfRepoStructureFetcher + options: + show_root_heading: true + show_source: true + +::: tfbpapi.fetchers.HfSizeInfoFetcher + options: + show_root_heading: true + show_source: true diff --git a/docs/hf_cache_manager.md b/docs/hf_cache_manager.md new file mode 100644 index 0000000..752b712 --- /dev/null +++ b/docs/hf_cache_manager.md @@ -0,0 +1,6 @@ +# HfCacheManager + +::: tfbpapi.hf_cache_manager.HfCacheManager + options: + show_root_heading: true + show_source: true diff --git a/docs/huggingface_datacard.md b/docs/huggingface_datacard.md new file mode 100644 index 0000000..d56c771 --- /dev/null +++ b/docs/huggingface_datacard.md @@ -0,0 +1,496 @@ +# HuggingFace Dataset Card Format + +This document describes the expected YAML metadata format for HuggingFace dataset +repositories used with the tfbpapi package. The metadata is defined in the repository's +README.md file, at the top in a yaml block, and provides structured information about +the dataset configuration and contents. + +This documentation is intended for developers preparing or augmenting a huggingface +dataset repository to be compatible with tfbpapi. 
Before reading, please review the +[BrentLab/hackett_2020](https://huggingface.co/datasets/BrentLab/hackett_2020/blob/main/README.md) +datacard as an example of a complete implementation of a simple repository. After +reviewing Hackett 2020 and this documentation, it might be helpful to review a more +complex example such as: + +- [BrentLab/barkai_compendium](https://huggingface.co/datasets/BrentLab/barkai_compendium): + This contains a `genome_map` partitioned dataset with separate metadata applied via + the `applies_to` field. +- [Brentlab/rossi_2021](https://huggingface.co/datasets/BrentLab/rossi_2021): + This contains multiple `annotated_features` datasets with embedded metadata +- [Brentlab/yeast_genomic_features](https://huggingface.co/datasets/BrentLab/yeast_genomic_features): + This contains a simple `genomic_features` dataset used as a reference for other + datasets in the collection. + +## Dataset Types + +The `dataset_type` field is a property of each config (hierarchically under +`config_name`). `tfbpapi` recognizes the following dataset types: + +### 1. `genomic_features` +Static information about genomic features (genes, promoters, etc.) +- **Use case**: Gene annotations, regulatory classifications, static feature data +- **Structure**: One row per genomic feature +- **Required fields**: Usually includes gene identifiers, coordinates, classifications + +### 2. `annotated_features` +Quantitative data associated with genomic features. A field `sample_id` should exist +to identify single experiments in a single set of conditions. +- **Use case**: Expression data, binding scores, differential expression results +- **Structure**: Each sample will have one row per genomic feature measured. The + role `quantitative_measure` should be used to identify measurement columns. +- **Common fields**: `regulator_*`, `target_*` fields with the roles + `regulator_identifier` and `target_identifier` respectively. 
Fields with the role + `quantitative_measure` for measurements. + +### 3. `genome_map` +Position-level data across genomic coordinates +- **Use case**: Signal tracks, coverage data, genome-wide binding profiles +- **Structure**: Position-value pairs, often large datasets +- **Required fields**: `chr` (chromosome), `pos` (position), signal values + +### 4. `metadata` +Experimental metadata and sample descriptions +- **Use case**: Sample information, experimental conditions, protocol details. Note + that this can also include per-sample QC metrics. For cross-sample QC or analysis, + see [comparative](#5-comparative) below. +- **Structure**: One row per sample +- **Common fields**: Sample identifiers, experimental conditions, publication info +- **Special field**: `applies_to` - Optional list of config names this metadata applies to + +### 5. `comparative` + +Quality control metrics, validation results, and cross-dataset analysis outputs. + +**Use cases**: +- Cross-dataset quality assessments and validation metrics +- Analysis results relating samples across datasets or repositories +- Comparative analyses (e.g., binding vs expression correlation) + +**Structure**: One row represents an observation on 2 or more samples. Note that the + name of the column containing the sample references isn't specified. However, the + role and format of the sample references are strictly defined. See + [Defining Sample References](#defining-sample-references) below. + +#### Defining Sample References + +The name of the field which contains the sample reference is user-defined. However, +the contents of that field, and its role, must be as follows: + +- **`source_sample`**: Fields containing composite sample identifiers. This must be in + the format `"repo_id;config_name;sample_id"`. 
+
```
"repo_id;config_name;sample_id"
```

Examples:
- `"BrentLab/harbison_2004;harbison_2004;CBF1_YPD"`
- `"BrentLab/kemmeren_2014;kemmeren_2014;sample_42"`

## Experimental Conditions

Experimental conditions can be specified in three ways:
1. **Top-level** `experimental_conditions`: Apply to all configs in the repository.
   Use when experimental parameters are common across all datasets. This will occur
   at the same level as `configs`
2. **Config-level** `experimental_conditions`: Apply to a specific config
   ([dataset](#dataset)). Use when certain datasets have experimental parameters that
   are not shared by all other datasets in the [repository](#huggingface-repo), but
   are common to all [samples](#sample) within that dataset.
3. **Field-level** with `role: experimental_condition` ([feature-roles](#feature-roles)): For
   per-sample or per-measurement variation in experimental conditions stored as
   data columns. This is specified in the
   `dataset_info.features` ([feature-definitions](#feature-definitions))
   section of a config. `experimental_condition` fields which are categorical can be
   specifically defined in [categorical fields with value definitions](#categorical-fields-with-value-definitions).
+ +The priority of experimental conditions is: + +field-level > config-level > top-level + +**Example of all three methods:** +```yaml +# Top-level experimental conditions (apply to all [datasets](#dataset) in the repo) +experimental_conditions: + temperature_celsius: 30 +configs: +- config_name: overexpression_data + description: TF overexpression perturbation data + dataset_type: annotated_features + # The overexpression_data [dataset](#dataset) has an additional experimental + # condition that is specific to this dataset + experimental_conditions: + strain_background: "BY4741" + data_files: + - split: train + path: overexpression.parquet + dataset_info: + features: + - name: time + dtype: float + description: Time point in minutes + role: experimental_condition + - name: mechanism + dtype: string + description: Induction mechanism (GEV or ZEV) + role: experimental_condition + definitions: + GEV: + perturbation_method: + type: inducible_overexpression + system: GEV + inducer: beta-estradiol + description: "Galactose-inducible estrogen receptor-VP16 fusion system" + ZEV: + perturbation_method: + type: inducible_overexpression + system: ZEV + inducer: beta-estradiol + description: >- + "Z3 (synthetic zinc finger)-estrogen receptor-VP16 fusion system" + - name: log2_ratio + dtype: float + description: Log2 fold change + role: quantitative_measure +``` + +## Feature Definitions + +Each config must include detailed feature definitions in `dataset_info.features`: +```yaml +dataset_info: + features: + - name: field_name # Column name in the data + dtype: string # Data type (string, int64, float64, etc.) + description: "Detailed description of what this field contains" + role: "target_identifier" # Optional: semantic role of the feature +``` + +### Categorical Fields with Value Definitions + +For fields with `role: experimental_condition` that contain categorical values, you can +provide structured definitions for each value using the `definitions` field. 
This allows
machine-parsable specification of what each condition value means experimentally:
```yaml
- name: condition
  dtype:
    class_label:
      names: ["standard", "heat_shock"]
  role: experimental_condition
  description: Growth condition of the sample
  definitions:
    standard:
      media:
        name: synthetic_complete
        carbon_source:
          - compound: D-glucose
            concentration_percent: 2
        nitrogen_source:
          - compound: yeast_nitrogen_base
            # lastname et al 2025 used 6.71 g/L
            concentration_percent: 0.671
            specifications:
              - without_amino_acids
              - without_ammonium_sulfate
          - compound: ammonium_sulfate
            # lastname et al 2025 used 5 g/L
            concentration_percent: 0.5
          - compound: amino_acid_dropout_mix
            # lastname et al 2025 used 2 g/L
            concentration_percent: 0.2
    heat_shock:
      temperature_celsius: 37
      duration_minutes: 10
```

Each key in `definitions` must correspond to a possible value in the field.
The structure under each value provides experimental parameters specific to that
condition using the same nested format as `experimental_conditions` at config or
top level.

### Naming Conventions

**Gene/Feature Identifiers:**
- `(regulator/target)_locus_tag`: Systematic gene identifiers (e.g., "YJR060W"). Must
  be able to join to a genomic_features dataset. If none is specified,
  then the BrentLab/yeast_genomic_features is used
- `(regulator/target)_symbol`: Standard gene symbols (e.g., "CBF1"). Must be able to
  join to a genomic_features dataset. If none is specified,
  then the BrentLab/yeast_genomic_features is used

**Genomic Coordinates:**
Unless otherwise noted, assume that coordinates are 0-based, half-open intervals

- `chr`: Chromosome identifier
- `start`, `end`: Genomic coordinates
- `pos`: Single position
- `strand`: Strand information (+ or -)

## Feature Roles

The optional `role` field provides semantic meaning to features, especially useful
for annotated_features datasets.
The following roles are recognized by tfbpapi. +**NOTE** `experimental_condition` is a reserved role with additional behavior +as described above. + +## Partitioned Datasets + +For large datasets (eg most genome_map datasets), use partitioning: + +```yaml +dataset_info: + partitioning: + enabled: true + partition_by: ["accession"] # Partition column(s) + path_template: "data/accession={accession}/*.parquet" +``` + +This allows efficient querying of subsets without loading the entire dataset. + +## Metadata + +### Metadata Relationships with `applies_to` + +For metadata configs, you can explicitly specify which other configs the metadata +applies to using the `applies_to` field. This provides more control than automatic +type-based matching. + +```yaml +configs: +# Data configs +- config_name: genome_map_data + dataset_type: genome_map + # ... rest of config + +- config_name: binding_scores + dataset_type: annotated_features + # ... rest of config + +- config_name: expression_data + dataset_type: annotated_features + # ... rest of config + +# Metadata config that applies to multiple data configs +- config_name: repo_metadata + dataset_type: metadata + applies_to: ["genome_map_data", "binding_scores", "expression_data"] + # ... rest of config +``` + +### Embedded Metadata with `metadata_fields` + +When no explicit metadata config exists, you can extract metadata directly from the +dataset's own files using the `metadata_fields` field. This specifies which fields +should be treated as metadata. 
+ +### Single File Embedded Metadata + +For single parquet files, the system extracts distinct values using `SELECT DISTINCT`: + +```yaml +- config_name: binding_data + dataset_type: annotated_features + metadata_fields: ["regulator_symbol", "experimental_condition"] + data_files: + - split: train + path: binding_measurements.parquet + dataset_info: + features: + - name: regulator_symbol + dtype: string + description: Transcription factor name + - name: experimental_condition + dtype: string + description: Experimental treatment + - name: binding_score + dtype: float64 + description: Quantitative measurement +``` + +### Partitioned Dataset Embedded Metadata + +For partitioned datasets, partition values are extracted from directory structure: + +```yaml +- config_name: genome_map_data + dataset_type: genome_map + metadata_fields: ["run_accession", "regulator_symbol"] + data_files: + - split: train + path: genome_map/accession=*/regulator=*/*.parquet + dataset_info: + features: + - name: chr + dtype: string + description: Chromosome + - name: pos + dtype: int32 + description: Position + - name: signal + dtype: float32 + description: Signal intensity + partitioning: + enabled: true + partition_by: ["run_accession", "regulator_symbol"] +``` + +## Data File Organization + +### Single Files +```yaml +data_files: +- split: train + path: single_file.parquet +``` + +### Multiple Files/Partitioned Data +```yaml +data_files: +- split: train + path: data_directory/*/*.parquet # Glob patterns supported +``` + +## Complete Example Structure + +```yaml +license: mit +language: [en] +tags: [biology, genomics, transcription-factors] +pretty_name: "Example Genomics Dataset" +size_categories: [100K 5:\n", + " print(f\" ... 
and {len(repo_sizes) - 5} more repositories\")\n", + "\n", + "# Calculate total revisions\n", + "total_revisions = sum(len(repo.revisions) for repo in cache_info.repos)\n", + "print(f\"\\nTotal revisions across all repos: {total_revisions}\")\n", + "\n", + "# Show age distribution\n", + "from datetime import datetime\n", + "now = datetime.now().timestamp()\n", + "old_revisions = 0\n", + "for repo in cache_info.repos:\n", + " for rev in repo.revisions:\n", + " age_days = (now - rev.last_modified) / (24 * 3600)\n", + " if age_days > 30:\n", + " old_revisions += 1\n", + "\n", + "print(f\"Revisions older than 30 days: {old_revisions}\")\n", + "print(f\"Recent revisions (≤30 days): {total_revisions - old_revisions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Querying Loaded Metadata\n", + "\n", + "Once metadata is loaded into DuckDB, we can query it using SQL." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Internal Cache Management Methods\n", + "\n", + "HfCacheManager provides several internal methods that work behind the scenes. Let's explore what these methods do and how they integrate with the caching strategy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Working with Specific Metadata Configurations\n", + "\n", + "You can also retrieve metadata for specific configurations rather than all at once." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HfCacheManager Internal Methods:\n", + "===================================\n", + "\n", + "1. _get_metadata_for_config(config)\n", + " → Implements the 3-case strategy for a specific configuration\n", + " → Returns detailed result with strategy used and success status\n", + "\n", + "2. 
_check_metadata_exists_in_duckdb(table_name)\n", + " → Case 1: Checks if metadata table already exists in DuckDB\n", + " → Fast check using information_schema.tables\n", + "\n", + "3. _load_metadata_from_cache(config, table_name)\n", + " → Case 2: Attempts to load from local HuggingFace cache\n", + " → Uses try_to_load_from_cache() to find cached files\n", + "\n", + "4. _download_and_load_metadata(config, table_name)\n", + " → Case 3: Downloads from HuggingFace Hub if not cached\n", + " → Uses snapshot_download() for efficient file retrieval\n", + "\n", + "5. _create_duckdb_table_from_files(file_paths, table_name)\n", + " → Creates DuckDB views from parquet files\n", + " → Handles both single files and multiple files efficiently\n", + "\n", + "6. _extract_embedded_metadata_field(data_table, field, metadata_table)\n", + " → Extracts metadata fields from data tables\n", + " → Creates separate queryable metadata views\n", + "\n", + "These methods work together to provide:\n", + "• Transparent caching that 'just works'\n", + "• Minimal network usage through intelligent fallbacks\n", + "• Fast metadata access via DuckDB views\n", + "• Automatic handling of different file structures\n" + ] + } + ], + "source": [ + "# Demonstrate understanding of internal cache methods\n", + "print(\"HfCacheManager Internal Methods:\")\n", + "print(\"=\" * 35)\n", + "\n", + "print(\"\\n1. _get_metadata_for_config(config)\")\n", + "print(\" → Implements the 3-case strategy for a specific configuration\")\n", + "print(\" → Returns detailed result with strategy used and success status\")\n", + "\n", + "print(\"\\n2. _check_metadata_exists_in_duckdb(table_name)\")\n", + "print(\" → Case 1: Checks if metadata table already exists in DuckDB\")\n", + "print(\" → Fast check using information_schema.tables\")\n", + "\n", + "print(\"\\n3. 
_load_metadata_from_cache(config, table_name)\")\n", + "print(\" → Case 2: Attempts to load from local HuggingFace cache\")\n", + "print(\" → Uses try_to_load_from_cache() to find cached files\")\n", + "\n", + "print(\"\\n4. _download_and_load_metadata(config, table_name)\")\n", + "print(\" → Case 3: Downloads from HuggingFace Hub if not cached\")\n", + "print(\" → Uses snapshot_download() for efficient file retrieval\")\n", + "\n", + "print(\"\\n5. _create_duckdb_table_from_files(file_paths, table_name)\")\n", + "print(\" → Creates DuckDB views from parquet files\")\n", + "print(\" → Handles both single files and multiple files efficiently\")\n", + "\n", + "print(\"\\n6. _extract_embedded_metadata_field(data_table, field, metadata_table)\")\n", + "print(\" → Extracts metadata fields from data tables\")\n", + "print(\" → Creates separate queryable metadata views\")\n", + "\n", + "print(\"\\nThese methods work together to provide:\")\n", + "print(\"• Transparent caching that 'just works'\")\n", + "print(\"• Minimal network usage through intelligent fallbacks\")\n", + "print(\"• Fast metadata access via DuckDB views\")\n", + "print(\"• Automatic handling of different file structures\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Extracting Embedded Metadata\n", + "\n", + "Some datasets have metadata embedded within their data files. The HfCacheManager can extract this embedded metadata into separate, queryable tables." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Embedded Metadata Extraction\n", + "\n", + "One unique feature of HfCacheManager is the ability to extract embedded metadata fields from data tables into separate, queryable metadata tables." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Demonstrate embedded metadata extraction concept\n", + "print(\"Embedded Metadata Extraction:\")\n", + "print(\"=\" * 35)\n", + "\n", + "print(\"\\nScenario: You have a data table with embedded metadata fields\")\n", + "print(\"Example: genomics data with 'experimental_condition' field\")\n", + "\n", + "# Create sample data to demonstrate the concept\n", + "conn.execute(\"\"\"\n", + " CREATE TABLE sample_genomics_data AS \n", + " SELECT \n", + " 'gene_' || (row_number() OVER()) as gene_id,\n", + " random() * 1000 as expression_value,\n", + " CASE \n", + " WHEN (row_number() OVER()) % 4 = 0 THEN 'control'\n", + " WHEN (row_number() OVER()) % 4 = 1 THEN 'treatment_A'\n", + " WHEN (row_number() OVER()) % 4 = 2 THEN 'treatment_B'\n", + " ELSE 'stress_condition'\n", + " END as experimental_condition,\n", + " CASE \n", + " WHEN (row_number() OVER()) % 3 = 0 THEN 'timepoint_0h'\n", + " WHEN (row_number() OVER()) % 3 = 1 THEN 'timepoint_6h'\n", + " ELSE 'timepoint_24h'\n", + " END as timepoint\n", + " FROM range(100)\n", + "\"\"\")\n", + "\n", + "print(\"✓ Created sample genomics data with embedded metadata fields\")\n", + "\n", + "# Show the data structure\n", + "sample_data = conn.execute(\n", + " \"SELECT * FROM sample_genomics_data LIMIT 5\"\n", + ").fetchall()\n", + "\n", + "print(f\"\\nSample data structure:\")\n", + "print(\"gene_id | expression_value | experimental_condition | timepoint\")\n", + "print(\"-\" * 65)\n", + "for row in sample_data:\n", + " print(f\"{row[0]:8} | {row[1]:15.1f} | {row[2]:20} | {row[3]}\")\n", + "\n", + "print(f\"\\nEmbedded metadata fields identified:\")\n", + "print(\"• experimental_condition: Contains treatment/control information\")\n", + "print(\"• timepoint: Contains temporal sampling information\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use HfCacheManager to extract embedded metadata\n", + "print(\"Using 
HfCacheManager for Metadata Extraction:\")\n", + "print(\"=\" * 50)\n", + "\n", + "# Extract experimental_condition metadata\n", + "success1 = cache_manager._extract_embedded_metadata_field(\n", + " 'sample_genomics_data', \n", + " 'experimental_condition', \n", + " 'metadata_experimental_conditions'\n", + ")\n", + "\n", + "# Extract timepoint metadata \n", + "success2 = cache_manager._extract_embedded_metadata_field(\n", + " 'sample_genomics_data',\n", + " 'timepoint', \n", + " 'metadata_timepoints'\n", + ")\n", + "\n", + "print(f\"Experimental condition extraction: {'✓ Success' if success1 else '✗ Failed'}\")\n", + "print(f\"Timepoint extraction: {'✓ Success' if success2 else '✗ Failed'}\")\n", + "\n", + "# Show extracted metadata tables\n", + "if success1:\n", + " print(f\"\\nExtracted experimental conditions:\")\n", + " conditions = conn.execute(\n", + " \"SELECT value, count FROM metadata_experimental_conditions ORDER BY count DESC\"\n", + " ).fetchall()\n", + " \n", + " for condition, count in conditions:\n", + " print(f\" • {condition}: {count} samples\")\n", + "\n", + "if success2:\n", + " print(f\"\\nExtracted timepoints:\")\n", + " timepoints = conn.execute(\n", + " \"SELECT value, count FROM metadata_timepoints ORDER BY count DESC\"\n", + " ).fetchall()\n", + " \n", + " for timepoint, count in timepoints:\n", + " print(f\" • {timepoint}: {count} samples\")\n", + "\n", + "print(f\"\\nBenefits of extraction:\")\n", + "print(\"• Separate queryable metadata tables\")\n", + "print(\"• Fast metadata-based filtering and analysis\") \n", + "print(\"• Clear separation of data and metadata concerns\")\n", + "print(\"• Reusable metadata across different analyses\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current HuggingFace Cache Status:\n", + "===================================\n", + "Total size: 5.5G\n", + "Number of repositories: 11\n", + 
"\n", + "Repository breakdown:\n", + " • BrentLab/yeast_comparative_analysis: 166.1K (1 revisions)\n", + " • BrentLab/yeast_genome_resources: 114.5K (7 revisions)\n", + " • BrentLab/barkai_compendium: 3.6G (1 revisions)\n", + " • BrentLab/kemmeren_2014: 646.2M (3 revisions)\n", + " • BrentLab/hu_2007_reimand_2010: 42.7M (1 revisions)\n", + " ... and 6 more repositories\n", + "\n", + "Target repository (BrentLab/mahendrawada_2025) cache info:\n", + " Size: 94.3M\n", + " Revisions: 4\n", + " Latest revision: af5ac9dc\n", + " Last modified: 1763578870.280984\n" + ] + } + ], + "source": [ + "from huggingface_hub import scan_cache_dir\n", + "\n", + "# Get current cache information \n", + "cache_info = scan_cache_dir()\n", + "\n", + "print(\"Current HuggingFace Cache Status:\")\n", + "print(\"=\" * 35)\n", + "print(f\"Total size: {cache_info.size_on_disk_str}\")\n", + "print(f\"Number of repositories: {len(cache_info.repos)}\")\n", + "\n", + "print(\"\\nRepository breakdown:\")\n", + "for repo in list(cache_info.repos)[:5]: # Show first 5 repos\n", + " print(f\" • {repo.repo_id}: {repo.size_on_disk_str} ({len(repo.revisions)} revisions)\")\n", + "\n", + "if len(cache_info.repos) > 5:\n", + " print(f\" ... 
and {len(cache_info.repos) - 5} more repositories\")\n", + "\n", + "# Show target repository if it exists in cache\n", + "target_repo = None\n", + "for repo in cache_info.repos:\n", + " if repo.repo_id == cache_manager.repo_id:\n", + " target_repo = repo\n", + " break\n", + "\n", + "if target_repo:\n", + " print(f\"\\nTarget repository ({cache_manager.repo_id}) cache info:\")\n", + " print(f\" Size: {target_repo.size_on_disk_str}\")\n", + " print(f\" Revisions: {len(target_repo.revisions)}\")\n", + " if target_repo.revisions:\n", + " latest_rev = max(target_repo.revisions, key=lambda r: r.last_modified)\n", + " print(f\" Latest revision: {latest_rev.commit_hash[:8]}\")\n", + " print(f\" Last modified: {latest_rev.last_modified}\")\n", + "else:\n", + " print(f\"\\nTarget repository ({cache_manager.repo_id}) not found in cache.\")\n", + " print(\"It may need to be downloaded first.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cache Cleanup by Age" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaning cache by age (30+ days old):\n", + "========================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 41 old revisions. Will free 4.7G\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Cleanup strategy created:\n", + "Expected space freed: 4.7G\n", + "Items to delete: 46\n", + "\n", + "Breakdown of items to delete:\n", + " • Blob files: 27\n", + " • Reference files: 0\n", + " • Repository directories: 7\n", + " • Snapshot directories: 12\n", + "\n", + "Sample blob files to delete:\n", + " • /home/chase/.cache/huggingface/hub/datasets--BrentLab--harbison_2004/blobs/b5fbd9e98fd8ddadeeb5631e3b6f5055e917c98d\n", + " • /home/chase/.cache/huggingface/hub/datasets--BrentLab--hackett_2020/blobs/a85bd6b418d9644d9adaa1269c27f97469a4aaee51af63cf1aa041f62cd8ba2c\n", + " • /home/chase/.cache/huggingface/hub/datasets--BrentLab--hackett_2020/blobs/c3e72ccb1b8deba4bbfd18abe6081de7ec3914d9\n", + " ... and 24 more blob files\n" + ] + } + ], + "source": [ + "# Clean cache entries older than 30 days (dry run)\n", + "print(\"Cleaning cache by age (30+ days old):\")\n", + "print(\"=\" * 40)\n", + "\n", + "age_cleanup = cache_manager.clean_cache_by_age(\n", + " max_age_days=30,\n", + " dry_run=True # Set to False to actually execute\n", + ")\n", + "\n", + "print(f\"\\nCleanup strategy created:\")\n", + "print(f\"Expected space freed: {age_cleanup.expected_freed_size_str}\")\n", + "\n", + "# Count total items to delete across all categories\n", + "total_items = len(age_cleanup.blobs) + len(age_cleanup.refs) + len(age_cleanup.repos) + len(age_cleanup.snapshots)\n", + "print(f\"Items to delete: {total_items}\")\n", + "\n", + "# Show breakdown of what would be deleted\n", + "if total_items > 0:\n", + " print(f\"\\nBreakdown of items to delete:\")\n", + " print(f\" • Blob files: {len(age_cleanup.blobs)}\")\n", + " print(f\" • Reference files: {len(age_cleanup.refs)}\")\n", + " print(f\" • Repository directories: {len(age_cleanup.repos)}\")\n", + " print(f\" • Snapshot directories: {len(age_cleanup.snapshots)}\")\n", + " \n", + " # Show some example 
items\n", + " if age_cleanup.blobs:\n", + " print(f\"\\nSample blob files to delete:\")\n", + " for item in list(age_cleanup.blobs)[:3]:\n", + " print(f\" • {item}\")\n", + " if len(age_cleanup.blobs) > 3:\n", + " print(f\" ... and {len(age_cleanup.blobs) - 3} more blob files\")\n", + "else:\n", + " print(\"No old files found for cleanup.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cache Cleanup by Size" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaning cache to target size: 1GB\n", + "========================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Size-based cleanup strategy:\n", + "Expected space freed: 3.8G\n", + "Items to delete: 85\n", + "\n", + "Comparing cleanup strategies for 1GB:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " • oldest_first : 3.8G (85 items)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 4 revisions for deletion. Will free 4.0G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " • largest_first : 4.0G (8 items)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " • least_used : 3.8G (85 items)\n" + ] + } + ], + "source": [ + "# Clean cache to target size (dry run)\n", + "target_size = \"1GB\"\n", + "print(f\"Cleaning cache to target size: {target_size}\")\n", + "print(\"=\" * 40)\n", + "\n", + "size_cleanup = cache_manager.clean_cache_by_size(\n", + " target_size=target_size,\n", + " strategy=\"oldest_first\", # Can be: oldest_first, largest_first, least_used\n", + " dry_run=True\n", + ")\n", + "\n", + "print(f\"\\nSize-based cleanup strategy:\")\n", + "print(f\"Expected space freed: {size_cleanup.expected_freed_size_str}\")\n", + "\n", + "# Count total items to delete across all categories\n", + "total_items = len(size_cleanup.blobs) + len(size_cleanup.refs) + len(size_cleanup.repos) + len(size_cleanup.snapshots)\n", + "print(f\"Items to delete: {total_items}\")\n", + "\n", + "# Compare different strategies\n", + "strategies = [\"oldest_first\", \"largest_first\", \"least_used\"]\n", + "print(f\"\\nComparing cleanup strategies for {target_size}:\")\n", + "\n", + "for strategy in strategies:\n", + " try:\n", + " strategy_result = cache_manager.clean_cache_by_size(\n", + " target_size=target_size,\n", + " strategy=strategy,\n", + " dry_run=True\n", + " )\n", + " strategy_total = (len(strategy_result.blobs) + len(strategy_result.refs) + \n", + " len(strategy_result.repos) + len(strategy_result.snapshots))\n", + " print(f\" • {strategy:15}: {strategy_result.expected_freed_size_str:>8} \"\n", + " f\"({strategy_total} items)\")\n", + " except Exception as e:\n", + " print(f\" • {strategy:15}: Error - {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleaning Unused Revisions" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaning unused revisions (keep latest 2 per 
repo):\n", + "==================================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 31 unused revisions. Will free 642.9M\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Revision cleanup strategy:\n", + "Expected space freed: 642.9M\n", + "Items to delete: 118\n", + "\n", + "Breakdown of cleanup:\n", + " • Blob files: 87\n", + " • Reference files: 0\n", + " • Repository directories: 0\n", + " • Snapshot directories: 31\n", + "\n", + "Per-repository revision analysis:\n", + "\n", + " • BrentLab/yeast_comparative_analysis:\n", + " Total revisions: 1\n", + " Would keep: 1\n", + " Would delete: 0\n", + " Keep: ac03d065 (modified: 1767824941.5531375)\n", + "\n", + " • BrentLab/yeast_genome_resources:\n", + " Total revisions: 7\n", + " Would keep: 2\n", + " Would delete: 5\n", + " Keep: 42beb284 (modified: 1758155946.5549896)\n", + " Keep: 15fdb72f (modified: 1755819093.2306638)\n", + " Delete: 7441b9a8 (modified: 1755816785.6988702)\n", + "\n", + " • BrentLab/barkai_compendium:\n", + " Total revisions: 1\n", + " Would keep: 1\n", + " Would delete: 0\n", + " Keep: a987ef37 (modified: 1756926783.3167186)\n" + ] + } + ], + "source": [ + "# Clean unused revisions, keeping only the latest 2 per repository\n", + "print(\"Cleaning unused revisions (keep latest 2 per repo):\")\n", + "print(\"=\" * 50)\n", + "\n", + "revision_cleanup = cache_manager.clean_unused_revisions(\n", + " keep_latest=2,\n", + " dry_run=True\n", + ")\n", + "\n", + "print(f\"\\nRevision cleanup strategy:\")\n", + "print(f\"Expected space freed: {revision_cleanup.expected_freed_size_str}\")\n", + "\n", + "# Count total items to delete across all categories\n", + "total_items = len(revision_cleanup.blobs) + len(revision_cleanup.refs) + len(revision_cleanup.repos) + len(revision_cleanup.snapshots)\n", + 
"print(f\"Items to delete: {total_items}\")\n", + "\n", + "# Show breakdown\n", + "if total_items > 0:\n", + " print(f\"\\nBreakdown of cleanup:\")\n", + " print(f\" • Blob files: {len(revision_cleanup.blobs)}\")\n", + " print(f\" • Reference files: {len(revision_cleanup.refs)}\") \n", + " print(f\" • Repository directories: {len(revision_cleanup.repos)}\")\n", + " print(f\" • Snapshot directories: {len(revision_cleanup.snapshots)}\")\n", + "\n", + "# Show repository-specific breakdown\n", + "cache_info = scan_cache_dir()\n", + "if cache_info.repos:\n", + " print(\"\\nPer-repository revision analysis:\")\n", + " for repo in list(cache_info.repos)[:3]:\n", + " print(f\"\\n • {repo.repo_id}:\")\n", + " print(f\" Total revisions: {len(repo.revisions)}\")\n", + " print(f\" Would keep: {min(2, len(repo.revisions))}\")\n", + " print(f\" Would delete: {max(0, len(repo.revisions) - 2)}\")\n", + " \n", + " # Show revision details\n", + " sorted_revisions = sorted(repo.revisions, key=lambda r: r.last_modified, reverse=True)\n", + " for i, rev in enumerate(sorted_revisions[:2]):\n", + " print(f\" Keep: {rev.commit_hash[:8]} (modified: {rev.last_modified})\")\n", + " \n", + " for rev in sorted_revisions[2:3]: # Show one that would be deleted\n", + " print(f\" Delete: {rev.commit_hash[:8]} (modified: {rev.last_modified})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Automated Cache Management" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Starting automated cache cleanup...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Automated cache cleanup (comprehensive):\n", + "========================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 41 old revisions. Will free 4.7G\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n", + "INFO:__main__:Found 31 unused revisions. Will free 642.9M\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n", + "INFO:__main__:Selected 9 revisions for deletion. Will free 2.8M\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n", + "INFO:__main__:Automated cleanup complete. Total freed: 5.0GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Automated cleanup executed 3 strategies:\n", + " 1. Strategy freed: 4.7G\n", + " 2. Strategy freed: 642.9M\n", + " 3. Strategy freed: 2.8M\n", + "\n", + "Total space that would be freed: 5.0GB\n", + "Cache size after cleanup: 129.8MB\n" + ] + } + ], + "source": [ + "# Automated cache cleanup with multiple strategies\n", + "print(\"Automated cache cleanup (comprehensive):\")\n", + "print(\"=\" * 40)\n", + "\n", + "auto_cleanup = cache_manager.auto_clean_cache(\n", + " max_age_days=30, # Remove anything older than 30 days\n", + " max_total_size=\"5GB\", # Target maximum cache size\n", + " keep_latest_per_repo=2, # Keep 2 latest revisions per repo\n", + " dry_run=True # Dry run for safety\n", + ")\n", + "\n", + "print(f\"\\nAutomated cleanup executed {len(auto_cleanup)} strategies:\")\n", + "\n", + "total_freed = 0\n", + "for i, strategy in enumerate(auto_cleanup, 1):\n", + " print(f\" {i}. Strategy freed: {strategy.expected_freed_size_str}\")\n", + " total_freed += strategy.expected_freed_size\n", + "\n", + "print(f\"\\nTotal space that would be freed: {cache_manager._format_bytes(total_freed)}\")\n", + "\n", + "# Calculate final cache size\n", + "current_cache = scan_cache_dir()\n", + "final_size = current_cache.size_on_disk - total_freed\n", + "print(f\"Cache size after cleanup: {cache_manager._format_bytes(max(0, final_size))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Best Practices and Performance Tips\n", + "\n", + "Here are some best practices for using HfCacheManager effectively:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Best Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performance Demonstration: Cache Management Benefits\n", + "=======================================================\n", + "\n", + "Demonstrating cache cleanup performance...\n", + "\n", + "1. Cache scanning performance:\n", + " Time to scan cache: 0.096 seconds\n", + " Repositories found: 11\n", + " Total cache size: 5.5G\n", + "\n", + "2. Cleanup strategy creation performance:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 41 old revisions. Will free 4.7G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Age cleanup strategy: 0.094 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n", + "INFO:__main__:Found 31 unused revisions. Will free 642.9M\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Size cleanup strategy: 0.093 seconds\n", + " Revision cleanup strategy: 0.100 seconds\n", + "\n", + "Performance insights:\n", + "• Cache scanning is fast: 0.096s for 11 repos\n", + "• Cleanup strategy creation is efficient\n", + "• Dry runs allow safe preview of cleanup operations\n", + "• Multiple strategies can be compared quickly\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "print(\"Performance Demonstration: Cache Management Benefits\")\n", + "print(\"=\" * 55)\n", + "\n", + "print(\"\\nDemonstrating cache cleanup performance...\")\n", + "\n", + "# Show performance of cache scanning and cleanup strategy creation\n", + "print(\"\\n1. Cache scanning performance:\")\n", + "start_time = time.time()\n", + "cache_info = scan_cache_dir()\n", + "scan_time = time.time() - start_time\n", + "print(f\" Time to scan cache: {scan_time:.3f} seconds\")\n", + "print(f\" Repositories found: {len(cache_info.repos)}\")\n", + "print(f\" Total cache size: {cache_info.size_on_disk_str}\")\n", + "\n", + "# Show performance of cleanup strategy creation\n", + "print(\"\\n2. 
Cleanup strategy creation performance:\")\n", + "\n", + "start_time = time.time()\n", + "age_strategy = cache_manager.clean_cache_by_age(max_age_days=30, dry_run=True)\n", + "age_time = time.time() - start_time\n", + "print(f\" Age cleanup strategy: {age_time:.3f} seconds\")\n", + "\n", + "start_time = time.time()\n", + "size_strategy = cache_manager.clean_cache_by_size(target_size=\"1GB\", dry_run=True)\n", + "size_time = time.time() - start_time\n", + "print(f\" Size cleanup strategy: {size_time:.3f} seconds\")\n", + "\n", + "start_time = time.time()\n", + "revision_strategy = cache_manager.clean_unused_revisions(keep_latest=2, dry_run=True)\n", + "revision_time = time.time() - start_time\n", + "print(f\" Revision cleanup strategy: {revision_time:.3f} seconds\")\n", + "\n", + "print(f\"\\nPerformance insights:\")\n", + "print(f\"• Cache scanning is fast: {scan_time:.3f}s for {len(cache_info.repos)} repos\")\n", + "print(f\"• Cleanup strategy creation is efficient\")\n", + "print(f\"• Dry runs allow safe preview of cleanup operations\")\n", + "print(f\"• Multiple strategies can be compared quickly\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Memory and Storage Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory and Storage Optimization Tips:\n", + "========================================\n", + "\n", + "1. DuckDB Views vs Tables:\n", + " • HfCacheManager creates VIEWS by default (not tables)\n", + " • Views reference original parquet files without duplication\n", + " • This saves storage space while enabling fast SQL queries\n", + "\n", + "2. Metadata-First Workflow:\n", + " • Load metadata first to understand data structure\n", + " • Use metadata to filter and select specific data subsets\n", + " • Avoid loading entire datasets when only portions are needed\n", + "\n", + "3. 
Cache Management Strategy:\n", + " • Run automated cleanup regularly\n", + " • Keep cache size reasonable for your system\n", + " • Prioritize keeping recent and frequently-used datasets\n" + ] + } + ], + "source": [ + "print(\"Memory and Storage Optimization Tips:\")\n", + "print(\"=\" * 40)\n", + "\n", + "print(\"\\n1. DuckDB Views vs Tables:\")\n", + "print(\" • HfCacheManager creates VIEWS by default (not tables)\")\n", + "print(\" • Views reference original parquet files without duplication\")\n", + "print(\" • This saves storage space while enabling fast SQL queries\")\n", + "\n", + "print(\"\\n2. Metadata-First Workflow:\")\n", + "print(\" • Load metadata first to understand data structure\")\n", + "print(\" • Use metadata to filter and select specific data subsets\")\n", + "print(\" • Avoid loading entire datasets when only portions are needed\")\n", + "\n", + "print(\"\\n3. Cache Management Strategy:\")\n", + "print(\" • Run automated cleanup regularly\")\n", + "print(\" • Keep cache size reasonable for your system\")\n", + "print(\" • Prioritize keeping recent and frequently-used datasets\")\n", + "\n", + "# Demonstrate DuckDB view benefits\n", + "tables_info = conn.execute(\n", + " \"SELECT table_name, table_type FROM information_schema.tables WHERE table_name LIKE 'metadata_%'\"\n", + ").fetchall()\n", + "\n", + "if tables_info:\n", + " print(f\"\\nCurrent DuckDB objects ({len(tables_info)} total):\")\n", + " for table_name, table_type in tables_info:\n", + " print(f\" • {table_name}: {table_type}\")\n", + " \n", + " view_count = sum(1 for _, table_type in tables_info if table_type == 'VIEW')\n", + " print(f\"\\n {view_count} views created (space-efficient!)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Integration with Other Components\n", + "\n", + "The HfCacheManager works seamlessly with other components in the tfbpapi ecosystem." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HfCacheManager Integration Workflow:\n", + "========================================\n", + "\n", + "1. Cache Management Setup:\n", + " from tfbpapi.HfCacheManager import HfCacheManager\n", + " cache_mgr = HfCacheManager(repo_id, duckdb_conn)\n", + " # Inherits all DataCard functionality + cache management\n", + "\n", + "2. Proactive Cache Cleanup:\n", + " # Clean before large operations\n", + " cache_mgr.auto_clean_cache(max_total_size='5GB', dry_run=False)\n", + " # Or use specific strategies\n", + " cache_mgr.clean_cache_by_age(max_age_days=30)\n", + "\n", + "3. Data Loading with Cache Awareness:\n", + " # The 3-case strategy works automatically with HfQueryAPI\n", + " from tfbpapi import HfQueryAPI\n", + " query_api = HfQueryAPI(repo_id, duckdb_conn)\n", + " # Metadata loading uses cache manager's strategy\n", + " data_df = query_api.get_pandas('config_name')\n", + "\n", + "4. Embedded Metadata Extraction:\n", + " # Extract metadata fields after data loading\n", + " cache_mgr._extract_embedded_metadata_field(\n", + " 'data_table_name', 'metadata_field', 'metadata_table_name')\n", + "\n", + "5. Regular Cache Maintenance:\n", + " # Schedule regular cleanup\n", + " cache_mgr.clean_unused_revisions(keep_latest=2)\n", + " cache_mgr.clean_cache_by_size('10GB', strategy='oldest_first')\n", + "\n", + "Current Session State:\n", + "Repository: BrentLab/mahendrawada_2025\n", + "DuckDB tables: 0\n", + "HF cache size: 5.5G\n", + "Cache repositories: 11\n" + ] + } + ], + "source": [ + "print(\"HfCacheManager Integration Workflow:\")\n", + "print(\"=\" * 40)\n", + "\n", + "print(\"\\n1. 
Cache Management Setup:\")\n", + "print(\" from tfbpapi.HfCacheManager import HfCacheManager\")\n", + "print(\" cache_mgr = HfCacheManager(repo_id, duckdb_conn)\")\n", + "print(\" # Inherits all DataCard functionality + cache management\")\n", + "\n", + "print(\"\\n2. Proactive Cache Cleanup:\")\n", + "print(\" # Clean before large operations\")\n", + "print(\" cache_mgr.auto_clean_cache(max_total_size='5GB', dry_run=False)\")\n", + "print(\" # Or use specific strategies\")\n", + "print(\" cache_mgr.clean_cache_by_age(max_age_days=30)\")\n", + "\n", + "print(\"\\n3. Data Loading with Cache Awareness:\")\n", + "print(\" # The 3-case strategy works automatically with HfQueryAPI\")\n", + "print(\" from tfbpapi import HfQueryAPI\")\n", + "print(\" query_api = HfQueryAPI(repo_id, duckdb_conn)\")\n", + "print(\" # Metadata loading uses cache manager's strategy\")\n", + "print(\" data_df = query_api.get_pandas('config_name')\")\n", + "\n", + "print(\"\\n4. Embedded Metadata Extraction:\")\n", + "print(\" # Extract metadata fields after data loading\")\n", + "print(\" cache_mgr._extract_embedded_metadata_field(\")\n", + "print(\" 'data_table_name', 'metadata_field', 'metadata_table_name')\")\n", + "\n", + "print(\"\\n5. Regular Cache Maintenance:\")\n", + "print(\" # Schedule regular cleanup\")\n", + "print(\" cache_mgr.clean_unused_revisions(keep_latest=2)\")\n", + "print(\" cache_mgr.clean_cache_by_size('10GB', strategy='oldest_first')\")\n", + "\n", + "# Show current state\n", + "print(f\"\\nCurrent Session State:\")\n", + "print(f\"Repository: {cache_manager.repo_id}\")\n", + "print(f\"DuckDB tables: {len(conn.execute('SELECT table_name FROM information_schema.tables').fetchall())}\")\n", + "\n", + "cache_info = scan_cache_dir()\n", + "print(f\"HF cache size: {cache_info.size_on_disk_str}\")\n", + "print(f\"Cache repositories: {len(cache_info.repos)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. 
Troubleshooting and Error Handling\n", + "\n", + "The HfCacheManager includes comprehensive error handling and diagnostic capabilities." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cache Management Troubleshooting:\n", + "===================================\n", + "\n", + "1. Import and Setup Issues:\n", + " • Ensure correct import: from tfbpapi.HfCacheManager import HfCacheManager\n", + " • Verify DuckDB connection: conn = duckdb.connect(':memory:')\n", + " • Check repository access permissions\n", + "\n", + "2. Cache Space and Performance Issues:\n", + " Current cache size: 5.5G\n", + " • Use auto_clean_cache() for automated management\n", + " • Monitor cache growth with scan_cache_dir()\n", + " • Set appropriate size limits for your system\n", + "\n", + "3. Cache Cleanup Issues:\n", + " • Use dry_run=True first to preview changes\n", + " • Check disk permissions for cache directory\n", + " • Verify no active processes are using cached files\n", + "\n", + "4. DuckDB Integration Issues:\n", + " • Ensure DuckDB connection is active\n", + " • Check memory limits for in-memory databases\n", + " • Verify table names don't conflict\n", + "\n", + "Cache Health Check:\n", + "✓ DuckDB connection: DuckDB OK\n", + "✓ Cache access: 11 repositories found\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:No old revisions found to delete\n", + "INFO:__main__:Found 0 old revisions. Will free 0.0\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Cache cleanup methods: Working\n", + "\n", + "Current Status:\n", + "Repository: BrentLab/mahendrawada_2025\n", + "Logger configured: True\n", + "Cache management ready: ✓\n" + ] + } + ], + "source": [ + "print(\"Cache Management Troubleshooting:\")\n", + "print(\"=\" * 35)\n", + "\n", + "print(\"\\n1. Import and Setup Issues:\")\n", + "print(\" • Ensure correct import: from tfbpapi.HfCacheManager import HfCacheManager\")\n", + "print(\" • Verify DuckDB connection: conn = duckdb.connect(':memory:')\")\n", + "print(\" • Check repository access permissions\")\n", + "\n", + "print(\"\\n2. Cache Space and Performance Issues:\")\n", + "try:\n", + " cache_info = scan_cache_dir()\n", + " print(f\" Current cache size: {cache_info.size_on_disk_str}\")\n", + " print(\" • Use auto_clean_cache() for automated management\")\n", + " print(\" • Monitor cache growth with scan_cache_dir()\")\n", + " print(\" • Set appropriate size limits for your system\")\n", + " \n", + " # Show if cache is getting large\n", + " total_gb = cache_info.size_on_disk / (1024**3)\n", + " if total_gb > 10:\n", + " print(f\" ⚠️ Large cache detected ({total_gb:.1f}GB) - consider cleanup\")\n", + " \n", + "except Exception as e:\n", + " print(f\" Cache scan error: {e}\")\n", + "\n", + "print(\"\\n3. Cache Cleanup Issues:\")\n", + "print(\" • Use dry_run=True first to preview changes\")\n", + "print(\" • Check disk permissions for cache directory\")\n", + "print(\" • Verify no active processes are using cached files\")\n", + "\n", + "print(\"\\n4. 
DuckDB Integration Issues:\")\n", + "print(\" • Ensure DuckDB connection is active\")\n", + "print(\" • Check memory limits for in-memory databases\")\n", + "print(\" • Verify table names don't conflict\")\n", + "\n", + "# Perform health checks\n", + "print(f\"\\nCache Health Check:\")\n", + "\n", + "# Test DuckDB\n", + "try:\n", + " test_result = conn.execute(\"SELECT 'DuckDB OK' as status\").fetchone()\n", + " print(f\"✓ DuckDB connection: {test_result[0]}\")\n", + "except Exception as e:\n", + " print(f\"✗ DuckDB connection: {e}\")\n", + "\n", + "# Test cache access\n", + "try:\n", + " cache_info = scan_cache_dir()\n", + " print(f\"✓ Cache access: {len(cache_info.repos)} repositories found\")\n", + "except Exception as e:\n", + " print(f\"✗ Cache access: {e}\")\n", + "\n", + "# Test cache manager methods\n", + "try:\n", + " test_cleanup = cache_manager.clean_cache_by_age(max_age_days=999, dry_run=True)\n", + " print(f\"✓ Cache cleanup methods: Working\")\n", + "except Exception as e:\n", + " print(f\"✗ Cache cleanup methods: {e}\")\n", + "\n", + "print(f\"\\nCurrent Status:\")\n", + "print(f\"Repository: {cache_manager.repo_id}\")\n", + "print(f\"Logger configured: {cache_manager.logger is not None}\")\n", + "print(f\"Cache management ready: ✓\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tfbpapi-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/datacard_tutorial.ipynb b/docs/tutorials/datacard_tutorial.ipynb new file mode 100644 index 0000000..1556a1c --- /dev/null +++ b/docs/tutorials/datacard_tutorial.ipynb @@ -0,0 +1,606 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"# DataCard Tutorial: Exploring HuggingFace Dataset Metadata\n", + "\n", + "The `DataCard` class provides an interface for exploring HuggingFace dataset metadata without loading the actual genomic data. This is particularly useful for:\n", + "\n", + "- Understanding dataset structure and available configurations\n", + "- Exploring experimental conditions at all hierarchy levels\n", + "- Discovering metadata relationships\n", + "- Planning data analysis workflows and metadata table creation\n", + "\n", + "In this tutorial, we'll explore the **BrentLab/harbison_2004** dataset, which contains ChIP-chip data for transcription factor binding across 14 environmental conditions in yeast." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Instantiating a DataCard Object" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repository: BrentLab/harbison_2004\n" + ] + } + ], + "source": [ + "from tfbpapi.datacard import DataCard\n", + "\n", + "card = DataCard('BrentLab/harbison_2004')\n", + "\n", + "print(f\"Repository: {card.repo_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Repository Overview\n", + "\n", + "Let's start by getting a high-level overview of the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repository Information:\n", + "========================================\n", + "repo_id : BrentLab/harbison_2004\n", + "pretty_name : Harbison, 2004 ChIP-chip\n", + "license : mit\n", + "tags : ['genomics', 'yeast', 'transcription', 'binding']\n", + "language : ['en']\n", + "size_categories : ['1M` -- the full measurement-level data (one row per sample-target pair)\n", + "- `_meta` -- deduplicated sample-level metadata (one row per sample),\n", + " including derived columns from config property mappings (e.g., `carbon_source` resolved from DataCard field definitions, with factor aliases applied)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cell-6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered views:\n", + " dto_expanded\n", + " hackett\n", + " hackett_meta\n", + " harbison\n", + " harbison_meta\n", + " kemmeren\n", + " kemmeren_meta\n" + ] + } + ], + "source": [ + "# List all registered views\n", + "print(\"Registered views:\")\n", + "for name in vdb.tables():\n", + " print(f\" {name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "pdebujnqb9q", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "table", + "rawType": "object", + "type": "string" + }, + { + "name": "column_name", + "rawType": "object", + "type": "string" + }, + { + "name": "column_type", + "rawType": "object", + "type": "string" + }, + { + "name": "null", + "rawType": "object", + "type": "string" + }, + { + "name": "key", + "rawType": "object", + "type": "unknown" + }, + { + "name": "default", + "rawType": "object", + "type": "unknown" + }, + { + "name": "extra", + 
"rawType": "object", + "type": "unknown" + } + ], + "ref": "4448022f-1d53-48aa-8c25-13f59b938630", + "rows": [ + [ + "0", + "harbison_meta", + "sample_id", + "INTEGER", + "YES", + null, + null, + null + ], + [ + "1", + "harbison_meta", + "condition", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "2", + "harbison_meta", + "regulator_locus_tag", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "3", + "harbison_meta", + "regulator_symbol", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "4", + "harbison_meta", + "carbon_source", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "5", + "harbison_meta", + "temperature_celsius", + "DOUBLE", + "YES", + null, + null, + null + ] + ], + "shape": { + "columns": 7, + "rows": 6 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tablecolumn_namecolumn_typenullkeydefaultextra
0harbison_metasample_idINTEGERYESNoneNoneNone
1harbison_metaconditionVARCHARYESNoneNoneNone
2harbison_metaregulator_locus_tagVARCHARYESNoneNoneNone
3harbison_metaregulator_symbolVARCHARYESNoneNoneNone
4harbison_metacarbon_sourceVARCHARYESNoneNoneNone
5harbison_metatemperature_celsiusDOUBLEYESNoneNoneNone
\n", + "
" + ], + "text/plain": [ + " table column_name column_type null key default extra\n", + "0 harbison_meta sample_id INTEGER YES None None None\n", + "1 harbison_meta condition VARCHAR YES None None None\n", + "2 harbison_meta regulator_locus_tag VARCHAR YES None None None\n", + "3 harbison_meta regulator_symbol VARCHAR YES None None None\n", + "4 harbison_meta carbon_source VARCHAR YES None None None\n", + "5 harbison_meta temperature_celsius DOUBLE YES None None None" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The _meta view has sample-level metadata plus derived columns\n", + "# (carbon_source, temperature_celsius resolved from condition definitions)\n", + "vdb.describe(\"harbison_meta\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9deee334", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "table", + "rawType": "object", + "type": "string" + }, + { + "name": "column_name", + "rawType": "object", + "type": "string" + }, + { + "name": "column_type", + "rawType": "object", + "type": "string" + }, + { + "name": "null", + "rawType": "object", + "type": "string" + }, + { + "name": "key", + "rawType": "object", + "type": "unknown" + }, + { + "name": "default", + "rawType": "object", + "type": "unknown" + }, + { + "name": "extra", + "rawType": "object", + "type": "unknown" + } + ], + "ref": "96218820-955e-406b-91ab-9502edc25713", + "rows": [ + [ + "0", + "harbison", + "sample_id", + "INTEGER", + "YES", + null, + null, + null + ], + [ + "1", + "harbison", + "db_id", + "DOUBLE", + "YES", + null, + null, + null + ], + [ + "2", + "harbison", + "regulator_locus_tag", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "3", + "harbison", + "regulator_symbol", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "4", + 
"harbison", + "condition", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "5", + "harbison", + "target_locus_tag", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "6", + "harbison", + "target_symbol", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "7", + "harbison", + "effect", + "DOUBLE", + "YES", + null, + null, + null + ], + [ + "8", + "harbison", + "pvalue", + "DOUBLE", + "YES", + null, + null, + null + ], + [ + "9", + "harbison", + "carbon_source", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "10", + "harbison", + "temperature_celsius", + "DOUBLE", + "YES", + null, + null, + null + ] + ], + "shape": { + "columns": 7, + "rows": 11 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tablecolumn_namecolumn_typenullkeydefaultextra
0harbisonsample_idINTEGERYESNoneNoneNone
1harbisondb_idDOUBLEYESNoneNoneNone
2harbisonregulator_locus_tagVARCHARYESNoneNoneNone
3harbisonregulator_symbolVARCHARYESNoneNoneNone
4harbisonconditionVARCHARYESNoneNoneNone
5harbisontarget_locus_tagVARCHARYESNoneNoneNone
6harbisontarget_symbolVARCHARYESNoneNoneNone
7harbisoneffectDOUBLEYESNoneNoneNone
8harbisonpvalueDOUBLEYESNoneNoneNone
9harbisoncarbon_sourceVARCHARYESNoneNoneNone
10harbisontemperature_celsiusDOUBLEYESNoneNoneNone
\n", + "
" + ], + "text/plain": [ + " table column_name column_type null key default extra\n", + "0 harbison sample_id INTEGER YES None None None\n", + "1 harbison db_id DOUBLE YES None None None\n", + "2 harbison regulator_locus_tag VARCHAR YES None None None\n", + "3 harbison regulator_symbol VARCHAR YES None None None\n", + "4 harbison condition VARCHAR YES None None None\n", + "5 harbison target_locus_tag VARCHAR YES None None None\n", + "6 harbison target_symbol VARCHAR YES None None None\n", + "7 harbison effect DOUBLE YES None None None\n", + "8 harbison pvalue DOUBLE YES None None None\n", + "9 harbison carbon_source VARCHAR YES None None None\n", + "10 harbison temperature_celsius DOUBLE YES None None None" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The full view has measurement-level data (one row per sample-target pair)\n", + "vdb.describe(\"harbison\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cell-9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Common fields: ['carbon_source', 'regulator_locus_tag', 'regulator_symbol', 'sample_id', 'temperature_celsius']\n" + ] + } + ], + "source": [ + "# Columns common to ALL primary dataset views\n", + "print(\"Common fields:\", vdb.get_common_fields())" + ] + }, + { + "cell_type": "markdown", + "id": "missing-value-labels-md", + "metadata": {}, + "source": [ + "## Missing Value Labels\n", + "\n", + "When a property key is listed under `missing_value_labels`, every dataset\n", + "that does **not** have an explicit mapping for that property will still expose\n", + "the column in its `_meta` view, filled with the configured fallback string.\n", + "\n", + "In the config above, `carbon_source: unspecified` is set in `missing_value_labels`.\n", + "All three datasets (harbison, kemmeren, hackett) happen to have an explicit\n", + "`carbon_source` mapping, so they resolve real 
values.\n", + "\n", + "To demonstrate the fallback, we build a minimal config that omits `carbon_source`\n", + "from kemmeren. Without `missing_value_labels`, kemmeren would have no\n", + "`carbon_source` column at all. With it, the column appears with the default value." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "missing-value-labels-code", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6316.72it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6990.51it/s]\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "harbison2 carbon_source values:\n", + " carbon_source\n", + "0 D-raffinose\n", + "1 D-galactose\n", + "2 glucose\n", + "3 unspecified\n", + "\n", + "kemmeren2 carbon_source values:\n", + " carbon_source\n", + "0 unspecified\n", + "\n", + "cross-dataset query using carbon_source on both:\n", + " dataset carbon_source n\n", + "0 harbison D-galactose 4\n", + "1 harbison D-raffinose 1\n", + "2 harbison glucose 310\n", + "3 harbison unspecified 37\n", + "4 kemmeren unspecified 1484\n" + ] + } + ], + "source": [ + "minimal_yaml = \"\"\"\n", + "repositories:\n", + " BrentLab/harbison_2004:\n", + " dataset:\n", + " harbison_2004:\n", + " db_name: harbison2\n", + " sample_id:\n", + " field: sample_id\n", + " # harbison has carbon_source mapped via field+path\n", + " carbon_source:\n", + " field: condition\n", + " path: media.carbon_source.compound\n", + "\n", + " BrentLab/kemmeren_2014:\n", + " dataset:\n", + " kemmeren_2014:\n", + " db_name: kemmeren2\n", + " sample_id:\n", + " field: sample_id\n", + " # kemmeren has NO 
carbon_source mapping -- fallback will apply\n", + "\n", + "factor_aliases:\n", + " carbon_source:\n", + " glucose: [D-glucose, dextrose, glu]\n", + "\n", + "missing_value_labels:\n", + " carbon_source: unspecified\n", + "\"\"\"\n", + "\n", + "import tempfile\n", + "from pathlib import Path\n", + "from tfbpapi.virtual_db import VirtualDB\n", + "\n", + "p = Path(tempfile.mkdtemp()) / \"minimal.yaml\"\n", + "p.write_text(minimal_yaml)\n", + "vdb2 = VirtualDB(str(p))\n", + "\n", + "# harbison resolves real values from DataCard definitions\n", + "print(\"harbison2 carbon_source values:\")\n", + "print(vdb2.query(\"SELECT DISTINCT carbon_source FROM harbison2_meta\"))\n", + "\n", + "# kemmeren has no mapping -- gets the missing_value_labels fallback\n", + "print(\"\\nkemmeren2 carbon_source values:\")\n", + "print(vdb2.query(\"SELECT DISTINCT carbon_source FROM kemmeren2_meta\"))\n", + "\n", + "# Both views expose the column, enabling cross-dataset queries without COALESCE\n", + "print(\"\\ncross-dataset query using carbon_source on both:\")\n", + "print(vdb2.query(\"\"\"\n", + " SELECT 'harbison' AS dataset, carbon_source, COUNT(*) AS n\n", + " FROM harbison2_meta GROUP BY carbon_source\n", + " UNION ALL\n", + " SELECT 'kemmeren' AS dataset, carbon_source, COUNT(*) AS n\n", + " FROM kemmeren2_meta GROUP BY carbon_source\n", + " ORDER BY dataset, carbon_source\n", + "\"\"\"))\n", + "\n", + "p.unlink(missing_ok=True)\n" + ] + }, + { + "cell_type": "markdown", + "id": "cell-10", + "metadata": {}, + "source": [ + "## Querying VirtualDB\n", + "\n", + "The `.query()` method executes SQL queries against the registered views. You can write complex SQL queries that join across multiple datasets, filter based on metadata, and aggregate results as needed. \n", + "\n", + "You can also use parameterized queries to safely inject variables into your SQL statements, and prepared statements for repeated queries with different parameters. 
\n", + "Named prepared statements can be passed to `.prepare()` and then executed with\n", + "`.query()` with any parameterized values passed in as an arbitrary number of key/value\n", + "arguments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a705f1c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + } + ], + "ref": "f6d762b2-08cc-4514-93fb-89fac1ce6c8b", + "rows": [ + [ + "0", + "118", + "H2O2Hi", + "YGL073W", + "HSF1", + "glucose", + "30.0" + ], + [ + "1", + "216", + "YPD", + "YKR064W", + "OAF3", + "glucose", + "30.0" + ], + [ + "2", + "314", + "SM", + "YOR358W", + "HAP5", + "unspecified", + "30.0" + ], + [ + "3", + "330", + "YPD", + "YPL177C", + "CUP9", + "glucose", + "30.0" + ], + [ + "4", + "9", + "RAPA", + "YBL103C", + "RTG3", + "glucose", + "30.0" + ] + ], + "shape": { + "columns": 6, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idconditionregulator_locus_tagregulator_symbolcarbon_sourcetemperature_celsius
0118H2O2HiYGL073WHSF1glucose30.0
1216YPDYKR064WOAF3glucose30.0
2314SMYOR358WHAP5unspecified30.0
3330YPDYPL177CCUP9glucose30.0
49RAPAYBL103CRTG3glucose30.0
\n", + "
" + ], + "text/plain": [ + " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n", + "0 118 H2O2Hi YGL073W HSF1 glucose \n", + "1 216 YPD YKR064W OAF3 glucose \n", + "2 314 SM YOR358W HAP5 unspecified \n", + "3 330 YPD YPL177C CUP9 glucose \n", + "4 9 RAPA YBL103C RTG3 glucose \n", + "\n", + " temperature_celsius \n", + "0 30.0 \n", + "1 30.0 \n", + "2 30.0 \n", + "3 30.0 \n", + "4 30.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query the _meta view for sample-level metadata (one row per sample)\n", + "# Note: carbon_source is derived from the condition column's DataCard definitions\n", + "# with factor aliases already applied (D-glucose -> glucose)\n", + "df_meta = vdb.query(\"SELECT * FROM harbison_meta LIMIT 5\")\n", + "df_meta" + ] + }, + { + "cell_type": "markdown", + "id": "cell-16", + "metadata": {}, + "source": [ + "## 5. Parameterized Queries\n", + "\n", + "Pass keyword arguments to `query()` and reference them with\n", + "DuckDB's `$name` syntax." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-17", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "db_id", + "rawType": "float64", + "type": "float" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "target_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "target_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "effect", + "rawType": "float64", + "type": "float" + }, + { + "name": "pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + } + ], + "ref": "e270aed6-9c2d-445a-b7d5-9fe1f96455ff", + "rows": [ + [ + "0", + "13", + "12.0", + "YBR049C", + "REB1", + "H2O2Hi", + "YPR204W", + "YPR204W", + "0.90161323", + "0.6769426", + "glucose", + "30.0" + ], + [ + "1", + "13", + "12.0", + "YBR049C", + "REB1", + "H2O2Hi", + "YPR203W", + "YPR203W", + "1.0534522", + "0.38440432", + "glucose", + "30.0" + ], + [ + "2", + "13", + "12.0", + "YBR049C", + "REB1", + "H2O2Hi", + "YPR202W", + "YPR202W", + "1.0534522", + "0.38440432", + "glucose", + "30.0" + ], + [ + "3", + "13", + "12.0", + "YBR049C", + "REB1", + "H2O2Hi", + "YPR201W", + "ARR3", + "0.84429803", + "0.66537467", + "glucose", + "30.0" + ], + [ + "4", + "13", + "12.0", + "YBR049C", + "REB1", + "H2O2Hi", + "YPR200C", + "ARR2", + "0.84429803", + "0.66537467", + "glucose", + "30.0" + ] + ], + "shape": { + "columns": 11, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_iddb_idregulator_locus_tagregulator_symbolconditiontarget_locus_tagtarget_symboleffectpvaluecarbon_sourcetemperature_celsius
01312.0YBR049CREB1H2O2HiYPR204WYPR204W0.9016130.676943glucose30.0
11312.0YBR049CREB1H2O2HiYPR203WYPR203W1.0534520.384404glucose30.0
21312.0YBR049CREB1H2O2HiYPR202WYPR202W1.0534520.384404glucose30.0
31312.0YBR049CREB1H2O2HiYPR201WARR30.8442980.665375glucose30.0
41312.0YBR049CREB1H2O2HiYPR200CARR20.8442980.665375glucose30.0
\n", + "
" + ], + "text/plain": [ + " sample_id db_id regulator_locus_tag regulator_symbol condition \\\n", + "0 13 12.0 YBR049C REB1 H2O2Hi \n", + "1 13 12.0 YBR049C REB1 H2O2Hi \n", + "2 13 12.0 YBR049C REB1 H2O2Hi \n", + "3 13 12.0 YBR049C REB1 H2O2Hi \n", + "4 13 12.0 YBR049C REB1 H2O2Hi \n", + "\n", + " target_locus_tag target_symbol effect pvalue carbon_source \\\n", + "0 YPR204W YPR204W 0.901613 0.676943 glucose \n", + "1 YPR203W YPR203W 1.053452 0.384404 glucose \n", + "2 YPR202W YPR202W 1.053452 0.384404 glucose \n", + "3 YPR201W ARR3 0.844298 0.665375 glucose \n", + "4 YPR200C ARR2 0.844298 0.665375 glucose \n", + "\n", + " temperature_celsius \n", + "0 30.0 \n", + "1 30.0 \n", + "2 30.0 \n", + "3 30.0 \n", + "4 30.0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# A parameterized query has the following form, where `$reg` is a placeholder\n", + "# that gets replaced with the value provided in the `reg` argument.\n", + "vdb.query(\n", + " \"SELECT * FROM harbison WHERE regulator_symbol = $reg LIMIT 5\",\n", + " reg=\"REB1\",\n", + ")\n", + "\n", + "# A parameterized query can be saved for future use with the `.prepare()` method" + ] + }, + { + "cell_type": "markdown", + "id": "cell-18", + "metadata": {}, + "source": [ + "## Prepared Queries\n", + "\n", + "Use `prepare()` to register a named, reusable query template.\n", + "Then call it by name via `query()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-19", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "n", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "1f789068-26a6-466d-b977-ce3a58e6b547", + "rows": [ + [ + "0", + "MSN2", + "6" + ], + [ + "1", + "MSN4", + "5" + ], + [ + "2", + "HSF1", + "4" + ], + [ + "3", + "STE12", + "4" + ], + [ + "4", + "SKN7", + "4" + ], + [ + "5", + "YAP1", + "4" + ], + [ + "6", + "DIG1", + "4" + ], + [ + "7", + "RTG3", + "4" + ], + [ + "8", + "PHO2", + "3" + ], + [ + "9", + "ROX1", + "3" + ], + [ + "10", + "GZF3", + "3" + ], + [ + "11", + "SFP1", + "3" + ], + [ + "12", + "KSS1", + "3" + ], + [ + "13", + "CIN5", + "3" + ], + [ + "14", + "NRG1", + "3" + ], + [ + "15", + "MBP1", + "3" + ], + [ + "16", + "GAT1", + "3" + ], + [ + "17", + "AFT2", + "3" + ], + [ + "18", + "MOT3", + "3" + ], + [ + "19", + "PHD1", + "3" + ], + [ + "20", + "TEC1", + "3" + ], + [ + "21", + "YAP7", + "3" + ], + [ + "22", + "RIM101", + "3" + ], + [ + "23", + "AFT1", + "3" + ], + [ + "24", + "YJL206C", + "3" + ], + [ + "25", + "RPN4", + "3" + ], + [ + "26", + "REB1", + "3" + ], + [ + "27", + "FHL1", + "3" + ], + [ + "28", + "FKH2", + "3" + ], + [ + "29", + "MAL33", + "3" + ], + [ + "30", + "RPH1", + "3" + ], + [ + "31", + "YAP6", + "3" + ], + [ + "32", + "SOK2", + "2" + ], + [ + "33", + "HAP2", + "2" + ], + [ + "34", + "CAD1", + "2" + ], + [ + "35", + "MAC1", + "2" + ], + [ + "36", + "UME6", + "2" + ], + [ + "37", + "YAP5", + "2" + ], + [ + "38", + "MOT2", + "2" + ], + [ + "39", + "UME1", + "2" + ], + [ + "40", + "DAL81", + "2" + ], + [ + "41", + "GLN3", + "2" + ], + [ + "42", + "ARR1", + "2" + ], + [ + "43", + "IME4", + "2" + ], + [ + "44", + "ASH1", + "2" + ], + [ + "45", + "RLM1", + "2" 
+ ], + [ + "46", + "MSS11", + "2" + ], + [ + "47", + "MCM1", + "2" + ], + [ + "48", + "MGA1", + "2" + ], + [ + "49", + "RDS1", + "2" + ] + ], + "shape": { + "columns": 2, + "rows": 63 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
regulator_symboln
0MSN26
1MSN45
2HSF14
3STE124
4SKN74
.........
58PUT32
59RTG12
60ADR12
61UGA32
62PDR12
\n", + "

63 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " regulator_symbol n\n", + "0 MSN2 6\n", + "1 MSN4 5\n", + "2 HSF1 4\n", + "3 STE12 4\n", + "4 SKN7 4\n", + ".. ... ..\n", + "58 PUT3 2\n", + "59 RTG1 2\n", + "60 ADR1 2\n", + "61 UGA3 2\n", + "62 PDR1 2\n", + "\n", + "[63 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Register a prepared query\n", + "vdb.prepare(\"glucose_regs\", \"\"\"\n", + " SELECT regulator_symbol, COUNT(*) AS n\n", + " FROM harbison_meta\n", + " WHERE carbon_source = $cs\n", + " GROUP BY regulator_symbol\n", + " HAVING n >= $min_n\n", + " ORDER BY n DESC\n", + "\"\"\")\n", + "\n", + "# note that rather than a SQL statement, we pass in the name of the prepared query\n", + "# and provide the appropriate parameters\n", + "vdb.query(\"glucose_regs\", cs=\"glucose\", min_n=2)" + ] + }, + { + "cell_type": "markdown", + "id": "cell-20", + "metadata": {}, + "source": [ + "## 7. Comparative Dataset Views\n", + "\n", + "Comparative datasets (those with `links`) get an extra view type:\n", + "\n", + "**`_expanded`**: For each composite ID field, adds two parsed columns:\n", + "- `_source` -- the source dataset, aliased to `db_name` when\n", + " the `repo_id;config_name` pair is in the VirtualDB config.\n", + "- `_id` -- the sample_id component.\n", + "\n", + "This makes it easy to join or filter by source dataset without manually\n", + "parsing composite IDs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-21", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "binding_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "binding_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_fdr", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "pr_ranking_column", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_source", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_source", + "rawType": "object", + "type": "string" + } + ], + "ref": "426c5717-57fa-4c0d-aa1f-b1947914421c", + "rows": [ + [ + "0", + "BrentLab/harbison_2004;harbison_2004;105", + "BrentLab/hughes_2006;overexpression;10", + "11.0", + "206.0", + "12.0", + "206.0", + "0.041292917490562644", + "0.017", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "105", + "harbison", + "10", + "BrentLab/hughes_2006;overexpression" + ], + [ + "1", + "BrentLab/harbison_2004;harbison_2004;108", + 
"BrentLab/hughes_2006;overexpression;11", + "60.0", + "67.0", + "60.0", + "67.0", + "0.05428351009647073", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "108", + "harbison", + "11", + "BrentLab/hughes_2006;overexpression" + ], + [ + "2", + "BrentLab/harbison_2004;harbison_2004;109", + "BrentLab/hughes_2006;overexpression;11", + "27.0", + "1265.0", + "27.0", + "1265.0", + "0.12321364371741866", + "0.057", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "109", + "harbison", + "11", + "BrentLab/hughes_2006;overexpression" + ] + ], + "shape": { + "columns": 15, + "rows": 3 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
binding_idperturbation_idbinding_rank_thresholdperturbation_rank_thresholdbinding_set_sizeperturbation_set_sizedto_fdrdto_empirical_pvaluepr_ranking_columnbinding_repo_datasetperturbation_repo_datasetbinding_id_idbinding_id_sourceperturbation_id_idperturbation_id_source
0BrentLab/harbison_2004;harbison_2004;105BrentLab/hughes_2006;overexpression;1011.0206.012.0206.00.0412930.017log2fcharbison_2004-harbison_2004hughes_2006-overexpression105harbison10BrentLab/hughes_2006;overexpression
1BrentLab/harbison_2004;harbison_2004;108BrentLab/hughes_2006;overexpression;1160.067.060.067.00.0542840.000log2fcharbison_2004-harbison_2004hughes_2006-overexpression108harbison11BrentLab/hughes_2006;overexpression
2BrentLab/harbison_2004;harbison_2004;109BrentLab/hughes_2006;overexpression;1127.01265.027.01265.00.1232140.057log2fcharbison_2004-harbison_2004hughes_2006-overexpression109harbison11BrentLab/hughes_2006;overexpression
\n", + "
" + ], + "text/plain": [ + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;105 \n", + "1 BrentLab/harbison_2004;harbison_2004;108 \n", + "2 BrentLab/harbison_2004;harbison_2004;109 \n", + "\n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/hughes_2006;overexpression;10 11.0 \n", + "1 BrentLab/hughes_2006;overexpression;11 60.0 \n", + "2 BrentLab/hughes_2006;overexpression;11 27.0 \n", + "\n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 206.0 12.0 206.0 \n", + "1 67.0 60.0 67.0 \n", + "2 1265.0 27.0 1265.0 \n", + "\n", + " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n", + "0 0.041293 0.017 log2fc \n", + "1 0.054284 0.000 log2fc \n", + "2 0.123214 0.057 log2fc \n", + "\n", + " binding_repo_dataset perturbation_repo_dataset binding_id_id \\\n", + "0 harbison_2004-harbison_2004 hughes_2006-overexpression 105 \n", + "1 harbison_2004-harbison_2004 hughes_2006-overexpression 108 \n", + "2 harbison_2004-harbison_2004 hughes_2006-overexpression 109 \n", + "\n", + " binding_id_source perturbation_id_id perturbation_id_source \n", + "0 harbison 10 BrentLab/hughes_2006;overexpression \n", + "1 harbison 11 BrentLab/hughes_2006;overexpression \n", + "2 harbison 11 BrentLab/hughes_2006;overexpression " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The expanded view has parsed _source and _id columns for each link field\n", + "vdb.query(\"SELECT * FROM dto_expanded LIMIT 3\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-22", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_locus_tag", + 
"rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_fdr", + "rawType": "float64", + "type": "float" + } + ], + "ref": "43a63e85-fc7c-4630-873c-6a44f8af7442", + "rows": [ + [ + "0", + "314", + "SM", + "YOR358W", + "HAP5", + "unspecified", + "30.0", + "0.0", + "0.047097156398104266" + ], + [ + "1", + "240", + "YPD", + "YML007W", + "YAP1", + "glucose", + "30.0", + "0.0", + "0.14091317634369943" + ], + [ + "2", + "330", + "YPD", + "YPL177C", + "CUP9", + "glucose", + "30.0", + "0.0", + "0.00039874225300765584" + ], + [ + "3", + "114", + "H2O2Hi", + "YGL071W", + "AFT1", + "glucose", + "30.0", + "0.0", + "0.09653511969862681" + ], + [ + "4", + "118", + "H2O2Hi", + "YGL073W", + "HSF1", + "glucose", + "30.0", + "0.0", + "0.03150882247029168" + ], + [ + "5", + "31", + "H2O2Hi", + "YDL020C", + "RPN4", + "glucose", + "30.0", + "0.0", + "0.12466961356179365" + ], + [ + "6", + "303", + "YPD", + "YOR028C", + "CIN5", + "glucose", + "30.0", + "0.0", + "0.03621718920889537" + ], + [ + "7", + "36", + "H2O2Lo", + "YDL056W", + "MBP1", + "glucose", + "30.0", + "0.0", + "0.04300429120153643" + ], + [ + "8", + "15", + "YPD", + "YBR049C", + "REB1", + "glucose", + "30.0", + "0.0", + "0.07954075079166496" + ], + [ + "9", + "162", + "H2O2Lo", + "YIL101C", + "XBP1", + "glucose", + "30.0", + "0.0", + "0.22690440962955793" + ] + ], + "shape": { + "columns": 8, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idconditionregulator_locus_tagregulator_symbolcarbon_sourcetemperature_celsiusdto_empirical_pvaluedto_fdr
0314SMYOR358WHAP5unspecified30.00.00.047097
1240YPDYML007WYAP1glucose30.00.00.140913
2330YPDYPL177CCUP9glucose30.00.00.000399
3114H2O2HiYGL071WAFT1glucose30.00.00.096535
4118H2O2HiYGL073WHSF1glucose30.00.00.031509
531H2O2HiYDL020CRPN4glucose30.00.00.124670
6303YPDYOR028CCIN5glucose30.00.00.036217
736H2O2LoYDL056WMBP1glucose30.00.00.043004
815YPDYBR049CREB1glucose30.00.00.079541
9162H2O2LoYIL101CXBP1glucose30.00.00.226904
\n", + "
" + ], + "text/plain": [ + " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n", + "0 314 SM YOR358W HAP5 unspecified \n", + "1 240 YPD YML007W YAP1 glucose \n", + "2 330 YPD YPL177C CUP9 glucose \n", + "3 114 H2O2Hi YGL071W AFT1 glucose \n", + "4 118 H2O2Hi YGL073W HSF1 glucose \n", + "5 31 H2O2Hi YDL020C RPN4 glucose \n", + "6 303 YPD YOR028C CIN5 glucose \n", + "7 36 H2O2Lo YDL056W MBP1 glucose \n", + "8 15 YPD YBR049C REB1 glucose \n", + "9 162 H2O2Lo YIL101C XBP1 glucose \n", + "\n", + " temperature_celsius dto_empirical_pvalue dto_fdr \n", + "0 30.0 0.0 0.047097 \n", + "1 30.0 0.0 0.140913 \n", + "2 30.0 0.0 0.000399 \n", + "3 30.0 0.0 0.096535 \n", + "4 30.0 0.0 0.031509 \n", + "5 30.0 0.0 0.124670 \n", + "6 30.0 0.0 0.036217 \n", + "7 30.0 0.0 0.043004 \n", + "8 30.0 0.0 0.079541 \n", + "9 30.0 0.0 0.226904 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Join harbison metadata to dto via the expanded view's parsed columns\n", + "vdb.query(\"\"\"\n", + " SELECT h.*, d.dto_empirical_pvalue, d.dto_fdr\n", + " FROM harbison_meta h\n", + " JOIN dto_expanded d\n", + " ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'harbison'\n", + " WHERE d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + " LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-23", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "harbison_sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "hackett_sample_id", + "rawType": "object", + "type": "string" + 
} + ], + "ref": "75bcfd39-bdd3-40ed-8c32-57a86f2e5145", + "rows": [ + [ + "0", + "289", + "DAL82", + "0.0", + "1213" + ], + [ + "1", + "224", + "ACE2", + "0.0", + "901" + ], + [ + "2", + "283", + "RAP1", + "0.0", + "96_238" + ], + [ + "3", + "8", + "RTG3", + "0.0", + "57" + ], + [ + "4", + "75", + "CAD1", + "0.0", + "360" + ], + [ + "5", + "246", + "ARG81", + "0.0", + "1023" + ], + [ + "6", + "209", + "HAP4", + "0.0", + "802" + ], + [ + "7", + "83", + "GCN4", + "0.0", + "357" + ], + [ + "8", + "55", + "SWI5", + "0.0", + "253" + ], + [ + "9", + "189", + "HIR3", + "0.0", + "772" + ] + ], + "shape": { + "columns": 4, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
harbison_sample_idregulator_symboldto_empirical_pvaluehackett_sample_id
0289DAL820.01213
1224ACE20.0901
2283RAP10.096_238
38RTG30.057
475CAD10.0360
5246ARG810.01023
6209HAP40.0802
783GCN40.0357
855SWI50.0253
9189HIR30.0772
\n", + "
" + ], + "text/plain": [ + " harbison_sample_id regulator_symbol dto_empirical_pvalue hackett_sample_id\n", + "0 289 DAL82 0.0 1213\n", + "1 224 ACE2 0.0 901\n", + "2 283 RAP1 0.0 96_238\n", + "3 8 RTG3 0.0 57\n", + "4 75 CAD1 0.0 360\n", + "5 246 ARG81 0.0 1023\n", + "6 209 HAP4 0.0 802\n", + "7 83 GCN4 0.0 357\n", + "8 55 SWI5 0.0 253\n", + "9 189 HIR3 0.0 772" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Cross-dataset join: harbison binding with hackett perturbation data\n", + "# via the DTO comparative dataset\n", + "vdb.query(\"\"\"\n", + " SELECT\n", + " h.sample_id AS harbison_sample_id,\n", + " h.regulator_symbol,\n", + " d.dto_empirical_pvalue,\n", + " d.perturbation_id_id AS hackett_sample_id\n", + " FROM harbison_meta h\n", + " JOIN dto_expanded d\n", + " ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'harbison'\n", + " WHERE d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + " LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-24", + "metadata": {}, + "source": [ + "## A realistic example\n", + "\n", + "Hackett has multiple experimental conditions that are unique to that dataset. There are\n", + "some regulators which have replicates within those conditions. We need to find those \n", + "regulators and design a query which returns only 1 sample per condition set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f03e942a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "time", + "rawType": "float64", + "type": "float" + }, + { + "name": "mechanism", + "rawType": "object", + "type": "string" + }, + { + "name": "restriction", + "rawType": "object", + "type": "string" + }, + { + "name": "n", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "933fb2fe-799d-4a25-ae4e-ef95ed28bbc4", + "rows": [ + [ + "0", + "SWI1", + "15.0", + "ZEV", + "P", + "3" + ], + [ + "1", + "SWI1", + "30.0", + "ZEV", + "P", + "3" + ], + [ + "2", + "SWI1", + "20.0", + "ZEV", + "P", + "3" + ], + [ + "3", + "SWI1", + "5.0", + "ZEV", + "P", + "3" + ], + [ + "4", + "SWI1", + "90.0", + "ZEV", + "P", + "3" + ], + [ + "5", + "SWI1", + "0.0", + "ZEV", + "P", + "3" + ], + [ + "6", + "SWI1", + "45.0", + "ZEV", + "P", + "3" + ], + [ + "7", + "SWI1", + "10.0", + "ZEV", + "P", + "3" + ], + [ + "8", + "MAC1", + "90.0", + "GEV", + "P", + "2" + ], + [ + "9", + "RDS2", + "20.0", + "ZEV", + "P", + "2" + ], + [ + "10", + "MAC1", + "45.0", + "GEV", + "P", + "2" + ], + [ + "11", + "MAC1", + "15.0", + "GEV", + "P", + "2" + ], + [ + "12", + "RDS2", + "30.0", + "ZEV", + "P", + "2" + ], + [ + "13", + "MAC1", + "30.0", + "GEV", + "P", + "2" + ], + [ + "14", + "RDS2", + "45.0", + "ZEV", + "P", + "2" + ], + [ + "15", + "RDS2", + "15.0", + "ZEV", + "P", + "2" + ], + [ + "16", + "MAC1", + "5.0", + "GEV", + "P", + "2" + ], + [ + "17", + "GCN4", + "15.0", + "ZEV", + "P", + "2" + ], + [ + "18", + "RDS2", + "10.0", + "ZEV", + "P", + "2" + ], + [ + "19", + "RDS2", + "0.0", + "ZEV", + "P", + "2" + ], + [ + "20", + "RDS2", + "90.0", + "ZEV", + "P", + "2" + ], + [ + "21", + "GCN4", + "45.0", + "ZEV", + "P", + "2" + ], + [ + 
"22", + "GCN4", + "30.0", + "ZEV", + "P", + "2" + ], + [ + "23", + "MAC1", + "0.0", + "GEV", + "P", + "2" + ], + [ + "24", + "RDS2", + "5.0", + "ZEV", + "P", + "2" + ], + [ + "25", + "GCN4", + "90.0", + "ZEV", + "P", + "2" + ], + [ + "26", + "GCN4", + "0.0", + "ZEV", + "P", + "2" + ] + ], + "shape": { + "columns": 5, + "rows": 27 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
regulator_symboltimemechanismrestrictionn
0SWI115.0ZEVP3
1SWI130.0ZEVP3
2SWI120.0ZEVP3
3SWI15.0ZEVP3
4SWI190.0ZEVP3
5SWI10.0ZEVP3
6SWI145.0ZEVP3
7SWI110.0ZEVP3
8MAC190.0GEVP2
9RDS220.0ZEVP2
10MAC145.0GEVP2
11MAC115.0GEVP2
12RDS230.0ZEVP2
13MAC130.0GEVP2
14RDS245.0ZEVP2
15RDS215.0ZEVP2
16MAC15.0GEVP2
17GCN415.0ZEVP2
18RDS210.0ZEVP2
19RDS20.0ZEVP2
20RDS290.0ZEVP2
21GCN445.0ZEVP2
22GCN430.0ZEVP2
23MAC10.0GEVP2
24RDS25.0ZEVP2
25GCN490.0ZEVP2
26GCN40.0ZEVP2
\n", + "
" + ], + "text/plain": [ + " regulator_symbol time mechanism restriction n\n", + "0 SWI1 15.0 ZEV P 3\n", + "1 SWI1 30.0 ZEV P 3\n", + "2 SWI1 20.0 ZEV P 3\n", + "3 SWI1 5.0 ZEV P 3\n", + "4 SWI1 90.0 ZEV P 3\n", + "5 SWI1 0.0 ZEV P 3\n", + "6 SWI1 45.0 ZEV P 3\n", + "7 SWI1 10.0 ZEV P 3\n", + "8 MAC1 90.0 GEV P 2\n", + "9 RDS2 20.0 ZEV P 2\n", + "10 MAC1 45.0 GEV P 2\n", + "11 MAC1 15.0 GEV P 2\n", + "12 RDS2 30.0 ZEV P 2\n", + "13 MAC1 30.0 GEV P 2\n", + "14 RDS2 45.0 ZEV P 2\n", + "15 RDS2 15.0 ZEV P 2\n", + "16 MAC1 5.0 GEV P 2\n", + "17 GCN4 15.0 ZEV P 2\n", + "18 RDS2 10.0 ZEV P 2\n", + "19 RDS2 0.0 ZEV P 2\n", + "20 RDS2 90.0 ZEV P 2\n", + "21 GCN4 45.0 ZEV P 2\n", + "22 GCN4 30.0 ZEV P 2\n", + "23 MAC1 0.0 GEV P 2\n", + "24 RDS2 5.0 ZEV P 2\n", + "25 GCN4 90.0 ZEV P 2\n", + "26 GCN4 0.0 ZEV P 2" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query hackett to find regulators with multiple samples in the same (time, mechanism)\n", + "# condition\n", + "vdb.query(\"\"\"\n", + " SELECT regulator_symbol, time, mechanism, restriction, COUNT(*) AS n\n", + " FROM hackett_meta\n", + " GROUP BY regulator_symbol, time, mechanism, restriction\n", + " HAVING n > 1\n", + " ORDER BY n DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d869036", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "date", + "rawType": "object", + "type": "string" + }, + { + "name": "mechanism", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "restriction", + "rawType": "object", + 
"type": "string" + }, + { + "name": "strain", + "rawType": "object", + "type": "string" + }, + { + "name": "time", + "rawType": "float64", + "type": "float" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + } + ], + "ref": "58631e0d-0adf-41e4-9676-3e51aecbc7dd", + "rows": [ + [ + "0", + "1636", + "20161117", + "ZEV", + "YPL016W", + "SWI1", + "P", + "SMY2266c", + "20.0", + "glucose", + "30.0" + ], + [ + "1", + "1620", + "20161117", + "ZEV", + "YPL016W", + "SWI1", + "P", + "SMY2266a", + "20.0", + "glucose", + "30.0" + ], + [ + "2", + "1628", + "20161117", + "ZEV", + "YPL016W", + "SWI1", + "P", + "SMY2266b", + "20.0", + "glucose", + "30.0" + ] + ], + "shape": { + "columns": 10, + "rows": 3 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_iddatemechanismregulator_locus_tagregulator_symbolrestrictionstraintimecarbon_sourcetemperature_celsius
0163620161117ZEVYPL016WSWI1PSMY2266c20.0glucose30.0
1162020161117ZEVYPL016WSWI1PSMY2266a20.0glucose30.0
2162820161117ZEVYPL016WSWI1PSMY2266b20.0glucose30.0
\n", + "
" + ], + "text/plain": [ + " sample_id date mechanism regulator_locus_tag regulator_symbol \\\n", + "0 1636 20161117 ZEV YPL016W SWI1 \n", + "1 1620 20161117 ZEV YPL016W SWI1 \n", + "2 1628 20161117 ZEV YPL016W SWI1 \n", + "\n", + " restriction strain time carbon_source temperature_celsius \n", + "0 P SMY2266c 20.0 glucose 30.0 \n", + "1 P SMY2266a 20.0 glucose 30.0 \n", + "2 P SMY2266b 20.0 glucose 30.0 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# SWI1 has 3 samples at time=20, mechanism=ZEV. Let's look at just those samples\n", + "vdb.query(\"\"\"\n", + " SELECT *\n", + " FROM hackett_meta\n", + " WHERE regulator_symbol = 'SWI1'\n", + " AND time = 20\n", + " AND mechanism = 'ZEV'\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89408d2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['SWI1', 'GCN4', 'RDS2', 'MAC1']\n" + ] + } + ], + "source": [ + "# In this case, there are three strains with otherwise the same experimental conditions.\n", + "# Rather than trying to choose among these right now, we might just want to get a\n", + "# unique list of the regulators with replicates in order to exclude them from an\n", + "# analysis that doesn't expect replicates.\n", + "replicated_hackett_regulators = vdb.query(\"\"\"\n", + " SELECT DISTINCT regulator_symbol\n", + " FROM hackett_meta\n", + " GROUP BY regulator_symbol, time, mechanism, restriction\n", + " HAVING COUNT(*) > 1\n", + "\"\"\").regulator_symbol.tolist()\n", + "print(replicated_hackett_regulators)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a3b802b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['SWI1', 'GCN4', 'RDS2', 'MAC1', 'GEV']\n" + ] + } + ], + "source": [ + "# GEV is another \"regulator\" we want to exclude\n", + 
"replicated_hackett_regulators.append(\"GEV\")\n", + "print(replicated_hackett_regulators)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abed8bc2", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "binding_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "binding_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_fdr", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "pr_ranking_column", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_source", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_source", + "rawType": "object", + "type": "string" + } + ], + "ref": "bf27f61f-dbfa-482f-a0a0-235eeabc3fee", + "rows": [ + [ + "0", + "BrentLab/harbison_2004;harbison_2004;105", + "BrentLab/hughes_2006;overexpression;10", + "11.0", + "206.0", + "12.0", + "206.0", + "0.041292917490562644", + "0.017", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "105", + "harbison", + "10", + 
"BrentLab/hughes_2006;overexpression" + ], + [ + "1", + "BrentLab/harbison_2004;harbison_2004;108", + "BrentLab/hughes_2006;overexpression;11", + "60.0", + "67.0", + "60.0", + "67.0", + "0.05428351009647073", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "108", + "harbison", + "11", + "BrentLab/hughes_2006;overexpression" + ], + [ + "2", + "BrentLab/harbison_2004;harbison_2004;109", + "BrentLab/hughes_2006;overexpression;11", + "27.0", + "1265.0", + "27.0", + "1265.0", + "0.12321364371741866", + "0.057", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "109", + "harbison", + "11", + "BrentLab/hughes_2006;overexpression" + ], + [ + "3", + "BrentLab/harbison_2004;harbison_2004;112", + "BrentLab/hughes_2006;overexpression;12", + "532.0", + "1093.0", + "532.0", + "1093.0", + "0.4363046674390623", + "0.092", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "112", + "harbison", + "12", + "BrentLab/hughes_2006;overexpression" + ], + [ + "4", + "BrentLab/harbison_2004;harbison_2004;113", + "BrentLab/hughes_2006;overexpression;12", + "10.0", + "556.0", + "10.0", + "556.0", + "0.01756663927480034", + "0.002", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "113", + "harbison", + "12", + "BrentLab/hughes_2006;overexpression" + ], + [ + "5", + "BrentLab/harbison_2004;harbison_2004;118", + "BrentLab/hughes_2006;overexpression;13", + "574.0", + "354.0", + "574.0", + "354.0", + "0.13894295437217577", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "118", + "harbison", + "13", + "BrentLab/hughes_2006;overexpression" + ], + [ + "6", + "BrentLab/harbison_2004;harbison_2004;119", + "BrentLab/hughes_2006;overexpression;13", + "251.0", + "492.0", + "251.0", + "492.0", + "0.11808548603694578", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "119", + "harbison", + "13", + 
"BrentLab/hughes_2006;overexpression" + ], + [ + "7", + "BrentLab/harbison_2004;harbison_2004;120", + "BrentLab/hughes_2006;overexpression;13", + "14.0", + "2954.0", + "14.0", + "2954.0", + "0.1616346595561947", + "1.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "120", + "harbison", + "13", + "BrentLab/hughes_2006;overexpression" + ], + [ + "8", + "BrentLab/harbison_2004;harbison_2004;121", + "BrentLab/hughes_2006;overexpression;13", + "422.0", + "544.0", + "423.0", + "544.0", + "0.401585299611564", + "0.001", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "121", + "harbison", + "13", + "BrentLab/hughes_2006;overexpression" + ], + [ + "9", + "BrentLab/harbison_2004;harbison_2004;122", + "BrentLab/hughes_2006;overexpression;14", + "842.0", + "152.0", + "842.0", + "152.0", + "0.37750827352885596", + "0.106", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "122", + "harbison", + "14", + "BrentLab/hughes_2006;overexpression" + ], + [ + "10", + "BrentLab/harbison_2004;harbison_2004;124", + "BrentLab/hughes_2006;overexpression;15", + "402.0", + "1417.0", + "402.0", + "1417.0", + "0.279937313245534", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "124", + "harbison", + "15", + "BrentLab/hughes_2006;overexpression" + ], + [ + "11", + "BrentLab/harbison_2004;harbison_2004;137", + "BrentLab/hughes_2006;overexpression;17", + "29.0", + "5.0", + "29.0", + "5.0", + "0.005954520941937803", + "0.043", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "137", + "harbison", + "17", + "BrentLab/hughes_2006;overexpression" + ], + [ + "12", + "BrentLab/harbison_2004;harbison_2004;141", + "BrentLab/hughes_2006;overexpression;18", + "653.0", + "1620.0", + "654.0", + "1620.0", + "0.442997844156436", + "0.812", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "141", + "harbison", + "18", + 
"BrentLab/hughes_2006;overexpression" + ], + [ + "13", + "BrentLab/harbison_2004;harbison_2004;142", + "BrentLab/hughes_2006;overexpression;18", + "497.0", + "25.0", + "497.0", + "25.0", + "0.3308129606327521", + "0.921", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "142", + "harbison", + "18", + "BrentLab/hughes_2006;overexpression" + ], + [ + "14", + "BrentLab/harbison_2004;harbison_2004;150", + "BrentLab/hughes_2006;overexpression;19", + "91.0", + "1948.0", + "91.0", + "1948.0", + "0.2949755757517485", + "0.578", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "150", + "harbison", + "19", + "BrentLab/hughes_2006;overexpression" + ], + [ + "15", + "BrentLab/harbison_2004;harbison_2004;151", + "BrentLab/hughes_2006;overexpression;21", + "57.0", + "386.0", + "57.0", + "386.0", + "0.0656826352687399", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "151", + "harbison", + "21", + "BrentLab/hughes_2006;overexpression" + ], + [ + "16", + "BrentLab/harbison_2004;harbison_2004;152", + "BrentLab/hughes_2006;overexpression;21", + "272.0", + "526.0", + "272.0", + "526.0", + "0.2405177062735934", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "152", + "harbison", + "21", + "BrentLab/hughes_2006;overexpression" + ], + [ + "17", + "BrentLab/harbison_2004;harbison_2004;153", + "BrentLab/hughes_2006;overexpression;21", + "186.0", + "1060.0", + "186.0", + "1060.0", + "0.20770457061222172", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "153", + "harbison", + "21", + "BrentLab/hughes_2006;overexpression" + ], + [ + "18", + "BrentLab/harbison_2004;harbison_2004;154", + "BrentLab/hughes_2006;overexpression;21", + "65.0", + "398.0", + "65.0", + "398.0", + "0.10461443622068167", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "154", + "harbison", + "21", + 
"BrentLab/hughes_2006;overexpression" + ], + [ + "19", + "BrentLab/harbison_2004;harbison_2004;157", + "BrentLab/hughes_2006;overexpression;22", + "482.0", + "176.0", + "482.0", + "176.0", + "0.14485664209958654", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "157", + "harbison", + "22", + "BrentLab/hughes_2006;overexpression" + ], + [ + "20", + "BrentLab/harbison_2004;harbison_2004;158", + "BrentLab/hughes_2006;overexpression;22", + "354.0", + "215.0", + "354.0", + "215.0", + "0.12060713643717419", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "158", + "harbison", + "22", + "BrentLab/hughes_2006;overexpression" + ], + [ + "21", + "BrentLab/harbison_2004;harbison_2004;159", + "BrentLab/hughes_2006;overexpression;22", + "550.0", + "611.0", + "550.0", + "611.0", + "0.2924649934604871", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "159", + "harbison", + "22", + "BrentLab/hughes_2006;overexpression" + ], + [ + "22", + "BrentLab/harbison_2004;harbison_2004;160", + "BrentLab/hughes_2006;overexpression;22", + "77.0", + "625.0", + "77.0", + "625.0", + "0.1062495373846105", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "160", + "harbison", + "22", + "BrentLab/hughes_2006;overexpression" + ], + [ + "23", + "BrentLab/harbison_2004;harbison_2004;161", + "BrentLab/hughes_2006;overexpression;23", + "37.0", + "3236.0", + "37.0", + "3236.0", + "0.014875454821573575", + "0.456", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "161", + "harbison", + "23", + "BrentLab/hughes_2006;overexpression" + ], + [ + "24", + "BrentLab/harbison_2004;harbison_2004;162", + "BrentLab/hughes_2006;overexpression;24", + "417.0", + "1082.0", + "417.0", + "1082.0", + "0.22690440962955793", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "162", + "harbison", + "24", + 
"BrentLab/hughes_2006;overexpression" + ], + [ + "25", + "BrentLab/harbison_2004;harbison_2004;163", + "BrentLab/hughes_2006;overexpression;24", + "896.0", + "710.0", + "896.0", + "710.0", + "0.41161010647006896", + "0.002", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "163", + "harbison", + "24", + "BrentLab/hughes_2006;overexpression" + ], + [ + "26", + "BrentLab/harbison_2004;harbison_2004;174", + "BrentLab/hughes_2006;overexpression;26", + "55.0", + "2135.0", + "55.0", + "2135.0", + "0.08879402276624998", + "0.006", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "174", + "harbison", + "26", + "BrentLab/hughes_2006;overexpression" + ], + [ + "27", + "BrentLab/harbison_2004;harbison_2004;175", + "BrentLab/hughes_2006;overexpression;27", + "79.0", + "354.0", + "79.0", + "354.0", + "0.36280804176948345", + "0.485", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "175", + "harbison", + "27", + "BrentLab/hughes_2006;overexpression" + ], + [ + "28", + "BrentLab/harbison_2004;harbison_2004;176", + "BrentLab/hughes_2006;overexpression;27", + "1.0", + "604.0", + "1.0", + "604.0", + "0.0", + "0.981", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "176", + "harbison", + "27", + "BrentLab/hughes_2006;overexpression" + ], + [ + "29", + "BrentLab/harbison_2004;harbison_2004;177", + "BrentLab/hughes_2006;overexpression;28", + "10.0", + "3654.0", + "10.0", + "3654.0", + "0.0", + "1.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "177", + "harbison", + "28", + "BrentLab/hughes_2006;overexpression" + ], + [ + "30", + "BrentLab/harbison_2004;harbison_2004;178", + "BrentLab/hughes_2006;overexpression;28", + "20.0", + "61.0", + "22.0", + "61.0", + "0.10253010965306489", + "0.707", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "178", + "harbison", + "28", + 
"BrentLab/hughes_2006;overexpression" + ], + [ + "31", + "BrentLab/harbison_2004;harbison_2004;179", + "BrentLab/hughes_2006;overexpression;28", + "6.0", + "1128.0", + "6.0", + "1128.0", + "0.15157064533525078", + "0.968", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "179", + "harbison", + "28", + "BrentLab/hughes_2006;overexpression" + ], + [ + "32", + "BrentLab/harbison_2004;harbison_2004;191", + "BrentLab/hughes_2006;overexpression;29", + "342.0", + "174.0", + "342.0", + "174.0", + "0.42452813230271436", + "0.452", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "191", + "harbison", + "29", + "BrentLab/hughes_2006;overexpression" + ], + [ + "33", + "BrentLab/harbison_2004;harbison_2004;192", + "BrentLab/hughes_2006;overexpression;30", + "132.0", + "227.0", + "132.0", + "227.0", + "0.22362783869614716", + "0.002", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "192", + "harbison", + "30", + "BrentLab/hughes_2006;overexpression" + ], + [ + "34", + "BrentLab/harbison_2004;harbison_2004;193", + "BrentLab/hughes_2006;overexpression;30", + "322.0", + "442.0", + "322.0", + "442.0", + "0.40950351528951207", + "0.021", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "193", + "harbison", + "30", + "BrentLab/hughes_2006;overexpression" + ], + [ + "35", + "BrentLab/harbison_2004;harbison_2004;194", + "BrentLab/hughes_2006;overexpression;30", + "76.0", + "43.0", + "76.0", + "43.0", + "0.12124752831206184", + "0.395", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "194", + "harbison", + "30", + "BrentLab/hughes_2006;overexpression" + ], + [ + "36", + "BrentLab/harbison_2004;harbison_2004;201", + "BrentLab/hughes_2006;overexpression;31", + "136.0", + "1104.0", + "136.0", + "1104.0", + "0.2752121157648751", + "0.001", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "201", + "harbison", + 
"31", + "BrentLab/hughes_2006;overexpression" + ], + [ + "37", + "BrentLab/harbison_2004;harbison_2004;202", + "BrentLab/hughes_2006;overexpression;31", + "287.0", + "36.0", + "287.0", + "36.0", + "0.06401671759841812", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "202", + "harbison", + "31", + "BrentLab/hughes_2006;overexpression" + ], + [ + "38", + "BrentLab/harbison_2004;harbison_2004;203", + "BrentLab/hughes_2006;overexpression;31", + "88.0", + "41.0", + "88.0", + "41.0", + "0.06563294471122981", + "0.003", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "203", + "harbison", + "31", + "BrentLab/hughes_2006;overexpression" + ], + [ + "39", + "BrentLab/harbison_2004;harbison_2004;204", + "BrentLab/hughes_2006;overexpression;31", + "318.0", + "1948.0", + "319.0", + "1948.0", + "0.380107954958676", + "0.57", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "204", + "harbison", + "31", + "BrentLab/hughes_2006;overexpression" + ], + [ + "40", + "BrentLab/harbison_2004;harbison_2004;205", + "BrentLab/hughes_2006;overexpression;31", + "467.0", + "646.0", + "467.0", + "646.0", + "0.42659723019346846", + "0.006", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "205", + "harbison", + "31", + "BrentLab/hughes_2006;overexpression" + ], + [ + "41", + "BrentLab/harbison_2004;harbison_2004;207", + "BrentLab/hughes_2006;overexpression;32", + "55.0", + "230.0", + "56.0", + "230.0", + "0.3233042722751513", + "0.796", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "207", + "harbison", + "32", + "BrentLab/hughes_2006;overexpression" + ], + [ + "42", + "BrentLab/harbison_2004;harbison_2004;208", + "BrentLab/hughes_2006;overexpression;32", + "25.0", + "126.0", + "25.0", + "126.0", + "0.0489281862304512", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "208", + "harbison", + "32", 
+ "BrentLab/hughes_2006;overexpression" + ], + [ + "43", + "BrentLab/harbison_2004;harbison_2004;209", + "BrentLab/hughes_2006;overexpression;32", + "122.0", + "688.0", + "122.0", + "688.0", + "0.10777396924484826", + "0.0", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "209", + "harbison", + "32", + "BrentLab/hughes_2006;overexpression" + ], + [ + "44", + "BrentLab/harbison_2004;harbison_2004;210", + "BrentLab/hughes_2006;overexpression;33", + "97.0", + "2113.0", + "97.0", + "2113.0", + "0.30052307036231024", + "0.807", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "210", + "harbison", + "33", + "BrentLab/hughes_2006;overexpression" + ], + [ + "45", + "BrentLab/harbison_2004;harbison_2004;219", + "BrentLab/hughes_2006;overexpression;34", + "172.0", + "245.0", + "172.0", + "245.0", + "0.41551695727724847", + "0.505", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "219", + "harbison", + "34", + "BrentLab/hughes_2006;overexpression" + ], + [ + "46", + "BrentLab/harbison_2004;harbison_2004;225", + "BrentLab/hughes_2006;overexpression;35", + "314.0", + "12.0", + "314.0", + "12.0", + "0.15336823656300558", + "0.877", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "225", + "harbison", + "35", + "BrentLab/hughes_2006;overexpression" + ], + [ + "47", + "BrentLab/harbison_2004;harbison_2004;228", + "BrentLab/hughes_2006;overexpression;36", + "358.0", + "2316.0", + "358.0", + "2316.0", + "0.33853600995025945", + "0.804", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "228", + "harbison", + "36", + "BrentLab/hughes_2006;overexpression" + ], + [ + "48", + "BrentLab/harbison_2004;harbison_2004;231", + "BrentLab/hughes_2006;overexpression;38", + "77.0", + "362.0", + "77.0", + "362.0", + "0.32227814728264126", + "0.36", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "231", + "harbison", + 
"38", + "BrentLab/hughes_2006;overexpression" + ], + [ + "49", + "BrentLab/harbison_2004;harbison_2004;232", + "BrentLab/hughes_2006;overexpression;38", + "40.0", + "3302.0", + "41.0", + "3302.0", + "0.01832419557792558", + "0.593", + "log2fc", + "harbison_2004-harbison_2004", + "hughes_2006-overexpression", + "232", + "harbison", + "38", + "BrentLab/hughes_2006;overexpression" + ] + ], + "shape": { + "columns": 15, + "rows": 29804 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
binding_idperturbation_idbinding_rank_thresholdperturbation_rank_thresholdbinding_set_sizeperturbation_set_sizedto_fdrdto_empirical_pvaluepr_ranking_columnbinding_repo_datasetperturbation_repo_datasetbinding_id_idbinding_id_sourceperturbation_id_idperturbation_id_source
0BrentLab/harbison_2004;harbison_2004;105BrentLab/hughes_2006;overexpression;1011.0206.012.0206.00.0412930.017log2fcharbison_2004-harbison_2004hughes_2006-overexpression105harbison10BrentLab/hughes_2006;overexpression
1BrentLab/harbison_2004;harbison_2004;108BrentLab/hughes_2006;overexpression;1160.067.060.067.00.0542840.000log2fcharbison_2004-harbison_2004hughes_2006-overexpression108harbison11BrentLab/hughes_2006;overexpression
2BrentLab/harbison_2004;harbison_2004;109BrentLab/hughes_2006;overexpression;1127.01265.027.01265.00.1232140.057log2fcharbison_2004-harbison_2004hughes_2006-overexpression109harbison11BrentLab/hughes_2006;overexpression
3BrentLab/harbison_2004;harbison_2004;112BrentLab/hughes_2006;overexpression;12532.01093.0532.01093.00.4363050.092log2fcharbison_2004-harbison_2004hughes_2006-overexpression112harbison12BrentLab/hughes_2006;overexpression
4BrentLab/harbison_2004;harbison_2004;113BrentLab/hughes_2006;overexpression;1210.0556.010.0556.00.0175670.002log2fcharbison_2004-harbison_2004hughes_2006-overexpression113harbison12BrentLab/hughes_2006;overexpression
................................................
29799BrentLab/callingcards;annotated_features_combi...BrentLab/kemmeren_2014;kemmeren_2014;784154.0905.0154.0905.00.0906650.000pvaluecallingcards-annotated_features_combinedkemmeren_2014-kemmeren_2014724-692-688BrentLab/callingcards;annotated_features_combined784kemmeren
29800BrentLab/callingcards;annotated_features_combi...BrentLab/kemmeren_2014;kemmeren_2014;666215.0108.0215.0108.00.0750360.005pvaluecallingcards-annotated_features_combinedkemmeren_2014-kemmeren_2014725-435-395BrentLab/callingcards;annotated_features_combined666kemmeren
29801BrentLab/callingcards;annotated_features_combi...BrentLab/kemmeren_2014;kemmeren_2014;271221.0925.0221.0925.00.4034840.126pvaluecallingcards-annotated_features_combinedkemmeren_2014-kemmeren_2014726-445-424BrentLab/callingcards;annotated_features_combined271kemmeren
29802BrentLab/callingcards;annotated_features_combi...BrentLab/kemmeren_2014;kemmeren_2014;1077281.073.0283.077.00.0959480.174pvaluecallingcards-annotated_features_combinedkemmeren_2014-kemmeren_201479-33BrentLab/callingcards;annotated_features_combined1077kemmeren
29803BrentLab/callingcards;annotated_features_combi...BrentLab/kemmeren_2014;kemmeren_2014;963526.0227.0527.0227.00.0649190.000pvaluecallingcards-annotated_features_combinedkemmeren_2014-kemmeren_201496-49BrentLab/callingcards;annotated_features_combined963kemmeren
\n", + "

29804 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;105 \n", + "1 BrentLab/harbison_2004;harbison_2004;108 \n", + "2 BrentLab/harbison_2004;harbison_2004;109 \n", + "3 BrentLab/harbison_2004;harbison_2004;112 \n", + "4 BrentLab/harbison_2004;harbison_2004;113 \n", + "... ... \n", + "29799 BrentLab/callingcards;annotated_features_combi... \n", + "29800 BrentLab/callingcards;annotated_features_combi... \n", + "29801 BrentLab/callingcards;annotated_features_combi... \n", + "29802 BrentLab/callingcards;annotated_features_combi... \n", + "29803 BrentLab/callingcards;annotated_features_combi... \n", + "\n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/hughes_2006;overexpression;10 11.0 \n", + "1 BrentLab/hughes_2006;overexpression;11 60.0 \n", + "2 BrentLab/hughes_2006;overexpression;11 27.0 \n", + "3 BrentLab/hughes_2006;overexpression;12 532.0 \n", + "4 BrentLab/hughes_2006;overexpression;12 10.0 \n", + "... ... ... \n", + "29799 BrentLab/kemmeren_2014;kemmeren_2014;784 154.0 \n", + "29800 BrentLab/kemmeren_2014;kemmeren_2014;666 215.0 \n", + "29801 BrentLab/kemmeren_2014;kemmeren_2014;271 221.0 \n", + "29802 BrentLab/kemmeren_2014;kemmeren_2014;1077 281.0 \n", + "29803 BrentLab/kemmeren_2014;kemmeren_2014;963 526.0 \n", + "\n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 206.0 12.0 206.0 \n", + "1 67.0 60.0 67.0 \n", + "2 1265.0 27.0 1265.0 \n", + "3 1093.0 532.0 1093.0 \n", + "4 556.0 10.0 556.0 \n", + "... ... ... ... \n", + "29799 905.0 154.0 905.0 \n", + "29800 108.0 215.0 108.0 \n", + "29801 925.0 221.0 925.0 \n", + "29802 73.0 283.0 77.0 \n", + "29803 227.0 527.0 227.0 \n", + "\n", + " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n", + "0 0.041293 0.017 log2fc \n", + "1 0.054284 0.000 log2fc \n", + "2 0.123214 0.057 log2fc \n", + "3 0.436305 0.092 log2fc \n", + "4 0.017567 0.002 log2fc \n", + "... ... ... ... 
\n", + "29799 0.090665 0.000 pvalue \n", + "29800 0.075036 0.005 pvalue \n", + "29801 0.403484 0.126 pvalue \n", + "29802 0.095948 0.174 pvalue \n", + "29803 0.064919 0.000 pvalue \n", + "\n", + " binding_repo_dataset perturbation_repo_dataset \\\n", + "0 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "1 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "2 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "3 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "4 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "... ... ... \n", + "29799 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29800 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29801 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29802 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29803 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "\n", + " binding_id_id binding_id_source \\\n", + "0 105 harbison \n", + "1 108 harbison \n", + "2 109 harbison \n", + "3 112 harbison \n", + "4 113 harbison \n", + "... ... ... \n", + "29799 724-692-688 BrentLab/callingcards;annotated_features_combined \n", + "29800 725-435-395 BrentLab/callingcards;annotated_features_combined \n", + "29801 726-445-424 BrentLab/callingcards;annotated_features_combined \n", + "29802 79-33 BrentLab/callingcards;annotated_features_combined \n", + "29803 96-49 BrentLab/callingcards;annotated_features_combined \n", + "\n", + " perturbation_id_id perturbation_id_source \n", + "0 10 BrentLab/hughes_2006;overexpression \n", + "1 11 BrentLab/hughes_2006;overexpression \n", + "2 11 BrentLab/hughes_2006;overexpression \n", + "3 12 BrentLab/hughes_2006;overexpression \n", + "4 12 BrentLab/hughes_2006;overexpression \n", + "... ... ... 
\n", + "29799 784 kemmeren \n", + "29800 666 kemmeren \n", + "29801 271 kemmeren \n", + "29802 1077 kemmeren \n", + "29803 963 kemmeren \n", + "\n", + "[29804 rows x 15 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vdb.query(\"SELECT * FROM dto_expanded\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-25", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " sample_id regulator_symbol time mechanism \\\n", + "0 448 ACA1 15.0 ZEV \n", + "1 448 ACA1 15.0 ZEV \n", + "2 448 ACA1 15.0 ZEV \n", + "3 448 ACA1 15.0 ZEV \n", + "4 448 ACA1 15.0 ZEV \n", + "\n", + " binding_id \\\n", + "0 BrentLab/callingcards;annotated_features;803 \n", + "1 BrentLab/harbison_2004;harbison_2004;88 \n", + "2 BrentLab/mahendrawada_2025;chec_mahendrawada_m... \n", + "3 BrentLab/callingcards;annotated_features;126 \n", + "4 BrentLab/callingcards;annotated_features;156 \n", + "\n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/hackett_2020;hackett_2020;448 110.0 \n", + "1 BrentLab/hackett_2020;hackett_2020;448 334.0 \n", + "2 BrentLab/hackett_2020;hackett_2020;448 3882.0 \n", + "3 BrentLab/hackett_2020;hackett_2020;448 437.0 \n", + "4 BrentLab/hackett_2020;hackett_2020;448 374.0 \n", + "\n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 346.0 113.0 346.0 \n", + "1 1.0 334.0 5524.0 \n", + "2 1.0 3883.0 5591.0 \n", + "3 1.0 442.0 5591.0 \n", + "4 1.0 376.0 5591.0 \n", + "\n", + " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n", + "0 0.236207 0.001 log2fc \n", + "1 0.000000 1.000 pvalue \n", + "2 0.000000 1.000 pvalue \n", + "3 0.000000 1.000 pvalue \n", + "4 0.000000 1.000 pvalue \n", + "\n", + " binding_repo_dataset \\\n", + "0 callingcards-annotated_features \n", + "1 harbison_2004-harbison_2004 \n", + "2 mahendrawada_2025-chec_mahendrawada_m2025_af_c... 
\n", + "3 callingcards-annotated_features \n", + "4 callingcards-annotated_features \n", + "\n", + " perturbation_repo_dataset binding_id_id \\\n", + "0 hackett_2020-hackett_2020 803 \n", + "1 hackett_2020-hackett_2020 88 \n", + "2 hackett_2020-hackett_2020 59 \n", + "3 hackett_2020-hackett_2020 126 \n", + "4 hackett_2020-hackett_2020 156 \n", + "\n", + " binding_id_source perturbation_id_id \\\n", + "0 BrentLab/callingcards;annotated_features 448 \n", + "1 harbison 448 \n", + "2 BrentLab/mahendrawada_2025;chec_mahendrawada_m... 448 \n", + "3 BrentLab/callingcards;annotated_features 448 \n", + "4 BrentLab/callingcards;annotated_features 448 \n", + "\n", + " perturbation_id_source \n", + "0 hackett \n", + "1 hackett \n", + "2 hackett \n", + "3 hackett \n", + "4 hackett \n" + ] + } + ], + "source": [ + "# We can remove those regulators from our query using a parameterized query\n", + "hackett_harbison_dto = vdb.query(\"\"\"\n", + "SELECT h.sample_id, h.regulator_symbol, h.time, h.mechanism,\n", + " dto.*\n", + "FROM hackett_meta h\n", + "LEFT JOIN (\n", + " SELECT *\n", + " FROM dto_expanded\n", + ") AS dto\n", + "ON CAST(h.sample_id AS VARCHAR) = dto.perturbation_id_id\n", + "WHERE h.regulator_symbol NOT IN $replicated_hacket_regulators\n", + " AND h.mechanism = 'ZEV'\n", + " AND h.restriction = 'P'\n", + " AND h.time = 15\n", + "ORDER BY h.regulator_symbol, h.time, h.mechanism\n", + "\"\"\",\n", + " replicated_hacket_regulators=replicated_hackett_regulators\n", + ")\n", + "print(hackett_harbison_dto.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-26", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up temp file\n", + "temp_config.unlink(missing_ok=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tfbpapi-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/virtual_db.md b/docs/virtual_db.md new file mode 100644 index 0000000..ded59f4 --- /dev/null +++ b/docs/virtual_db.md @@ -0,0 +1,85 @@ +# VirtualDB + +VirtualDB provides a SQL query interface across heterogeneous HuggingFace +datasets using an in-memory DuckDB database. Each dataset defines experimental +conditions in its own way, with properties stored at different hierarchy levels +(repository, dataset, or field) and using different naming conventions. +VirtualDB uses an external YAML configuration to map these varying structures +to a common schema, normalize factor level names (e.g., "D-glucose", +"dextrose", "glu" all become "glucose"), and enable cross-dataset queries with +standardized field names and values. + +For primary datasets, VirtualDB creates: + +- **`_meta`** -- one row per sample with derived metadata columns +- **``** -- full measurement-level data joined to the metadata view + +For comparative analysis datasets, VirtualDB creates: + +- **`_expanded`** -- the raw data with composite ID fields parsed + into `_source` (aliased to configured `db_name`) and + `_id` (sample_id) columns + +See the [configuration guide](virtual_db_configuration.md) for setup details +and the [tutorial](tutorials/virtual_db_tutorial.ipynb) for usage examples. + +## Advanced Usage + +The underlying DuckDB connection is available as `vdb._conn`. You can use +`_conn` to execute any SQL on the database, eg creating more views, or +creating a table in memory. + +Custom **views** created this way appear in `tables()`, `describe()`, and +`get_fields()` automatically because those methods query DuckDB's +`information_schema`. Custom **tables** do not appear in `tables()` (which +only lists views), but are fully queryable via `vdb.query()`. 
+ +Example -- create a materialized analysis table:: + + # Create a persistent in-memory table from a complex query. + # This example selects one "best" Hackett-2020 sample per regulator + # using a priority system: ZEV+P > GEV+P > GEV+M. + vdb._conn.execute(""" + CREATE OR REPLACE TABLE hackett_analysis_set AS + WITH regulator_tiers AS ( + SELECT + regulator_locus_tag, + CASE + WHEN BOOL_OR(mechanism = 'ZEV' AND restriction = 'P') THEN 1 + WHEN BOOL_OR(mechanism = 'GEV' AND restriction = 'P') THEN 2 + ELSE 3 + END AS tier + FROM hackett_meta + WHERE regulator_locus_tag NOT IN ('Z3EV', 'GEV') + GROUP BY regulator_locus_tag + ), + tier_filter AS ( + SELECT + h.sample_id, h.regulator_locus_tag, h.regulator_symbol, + h.mechanism, h.restriction, h.date, h.strain, t.tier + FROM hackett_meta h + JOIN regulator_tiers t USING (regulator_locus_tag) + WHERE + (t.tier = 1 AND h.mechanism = 'ZEV' AND h.restriction = 'P') + OR (t.tier = 2 AND h.mechanism = 'GEV' AND h.restriction = 'P') + OR (t.tier = 3 AND h.mechanism = 'GEV' AND h.restriction = 'M') + ) + SELECT DISTINCT + sample_id, regulator_locus_tag, regulator_symbol, + mechanism, restriction, date, strain + FROM tier_filter + WHERE regulator_symbol NOT IN ('GCN4', 'RDS2', 'SWI1', 'MAC1') + ORDER BY regulator_locus_tag, sample_id + """) + + df = vdb.query("SELECT * FROM hackett_analysis_set") + +Tables and views created this way are in-memory only and do not persist across +VirtualDB instances. They exist for the lifetime of the DuckDB connection. 
+ +## API Reference + +::: tfbpapi.virtual_db.VirtualDB + options: + show_root_heading: true + show_source: true diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md new file mode 100644 index 0000000..fd1e5f4 --- /dev/null +++ b/docs/virtual_db_configuration.md @@ -0,0 +1,408 @@ +# VirtualDB Configuration Guide + +VirtualDB requires a YAML configuration file that defines which datasets to +include, how to map their fields to common names, and how to normalize factor +levels. + +## Basic Example + +```yaml +repositories: + # Each repository defines a "table" in the virtual database + BrentLab/harbison_2004: + # REQUIRED: Specify which column is the sample identifier. The `field` + # value is the actual column name in the parquet data. At the repo level, + # it applies to all datasets in this repository. If not specified at + # either level, the default column name "sample_id" is assumed. + sample_id: + field: sample_id + # Repository-wide properties (apply to all datasets in this repository) + # Paths are explicit from the datacard root + nitrogen_source: + path: experimental_conditions.media.nitrogen_source.name + + dataset: + # Each dataset gets its own view with standardized fields + harbison_2004: + # note: this is optional. If not specified, then the config_name is used. 
+ # This is useful if the config_name isn't suited to a table name, or if it + # were to conflict with another dataset in the configuration + db_name: harbison + # Dataset-specific properties (constant for all samples) + # Explicit path from datacard/config root + phosphate_source: + path: experimental_conditions.media.phosphate_source.compound + + # Field-level properties (vary per sample) + # Path is relative to field's definitions dict + carbon_source: + field: condition + path: media.carbon_source.compound + dtype: string # Optional: specify data type + + # Field without path (column alias with normalization) + environmental_condition: + field: condition + + BrentLab/kemmeren_2014: + dataset: + kemmeren_2014: + # optional -- see the note for `db_name` in harbison above + db_name: kemmeren + # REQUIRED: If `sample_id` isn't defined at the repo level, it must be + # defined at the dataset level. The `field` value is the actual column + # name in the parquet data (does not need to be literally "sample_id"). + sample_id: + field: sample_id + # Same logical fields, different physical paths + # Explicit path from datacard/config root + carbon_source: + path: experimental_conditions.media.carbon_source.compound + dtype: string + temperature_celsius: + path: experimental_conditions.temperature_celsius + dtype: numeric # Enables numeric filtering with comparison operators + + # Comparative dataset example + BrentLab/yeast_comparative_analysis: + dataset: + dto: + # Use field mappings to change a field's displayed name. 
If not specifically + # listed, then the field is included as it exists in the source data + dto_fdr: + field: dto_fdr + dto_empirical_pvalue: + field: empirical_pvalue + + # links specify which primary datasets are referenced by composite ID fields + links: + binding_id: + - [BrentLab/harbison_2004, harbison_2004] + perturbation_id: + - [BrentLab/kemmeren_2014, kemmeren_2014] + +# ===== Normalization Rules ===== +# Map varying terminologies to standardized values +factor_aliases: + carbon_source: + glucose: [D-glucose, glu, dextrose] + galactose: [D-galactose, gal] + +# Handle missing values with defaults +missing_value_labels: + carbon_source: "unspecified" + +# ===== Documentation ===== +description: + carbon_source: The carbon source provided to the cells during growth +``` + +### Property Hierarchy + +Properties are extracted at three hierarchy levels: + +1. **Repository-wide**: Common to all datasets in a repository + - Paths relative to datacard/config root (explicit) + - Example: `path: experimental_conditions.media.nitrogen_source.name` + +2. **Dataset-specific**: Specific to one dataset configuration + - Paths relative to datacard/config root (explicit) + - Example: `path: experimental_conditions.media.phosphate_source.compound` + +3. 
**Field-level**: Vary per sample, defined in field definitions + - `field` specifies which field to extract from + - `path` relative to that field's definitions dict + - Example: `field: condition, path: media.carbon_source.compound` + +**Special case**: Field without path creates a column alias +- `field: condition` (no path) renames `condition` column, enables normalization + +### Path Resolution + +Paths use dot notation to navigate nested structures: + +**Repository/Dataset-level** (explicit paths from datacard root): +- `path: experimental_conditions.temperature_celsius` - access experimental conditions +- `path: experimental_conditions.media.carbon_source.compound` - nested condition data +- `path: description` - access fields outside experimental_conditions + +**Field-level** (paths relative to field definitions): +- `field: condition, path: media.carbon_source.compound` looks in field + `condition`'s definitions and navigates to `media.carbon_source.compound` + +### Data Type Specifications + +Field mappings support an optional `dtype` parameter to ensure proper type handling +during metadata extraction and query filtering. + +**Supported dtypes**: +- `string` - Text data (default if not specified) +- `numeric` - Numeric values (integers or floating-point numbers) +- `bool` - Boolean values (true/false) +- `factor` - Categorical data backed by a DuckDB ENUM type (see below) + +**When to use dtype**: + +1. **Numeric filtering**: Required for fields used with comparison operators + (`<`, `>`, `<=`, `>=`, `between`) +2. **Type consistency**: When source data might be extracted with incorrect type +3. 
**Categorical columns**: Use `factor` when a field has a fixed, known set of + levels and you want DuckDB to enforce membership and enable efficient storage + +### factor dtype (DuckDB ENUM) + +When `dtype: factor` is set on a field-only mapping, VirtualDB registers a DuckDB +ENUM type from the field's `class_label` definition in the DataCard and casts the +column to that type in the `_meta` view. + +**Requirements**: + +- `dtype: factor` may only be used with field-only mappings (`field:` specified, + no `path:` or `expression:`). +- The DataCard must declare the field with `dtype: {class_label: {names: [...]}}`. + If the field is missing, has a non-`class_label` dtype, or the `names` list is + absent or empty, VirtualDB raises a `ValueError` at view-registration time. + +**Column naming when the output name matches the source field**: + +When the mapping key equals the source field name (the common case, e.g. +`time: {field: time, dtype: factor}`), the raw column is preserved in the view +under a `_orig` alias so that the original values remain accessible: + +- `time` -- ENUM-typed column with levels from the DataCard +- `time_orig` -- original raw column (e.g., DOUBLE or VARCHAR) + +If `time_orig` already exists in the parquet, VirtualDB finds the next available +name: `time_orig_1`, `time_orig_2`, etc. 
+ +**Example DataCard feature definition** (in the HuggingFace dataset card YAML): + +```yaml +- name: time + dtype: + class_label: + names: + - 0 + - 5 + - 10 + - 15 + - 20 + - 45 + - 90 + description: Time point in minutes after induction +``` + +**Example VirtualDB config**: + +```yaml +repositories: + BrentLab/hackett_2020: + dataset: + hackett_2020: + db_name: hackett + sample_id: + field: sample_id + time: + field: time + dtype: factor +``` + +After view registration, `hackett_meta` will contain: +- `time` -- ENUM column, queryable as `WHERE time = '45'` +- `time_orig` -- original numeric column + +## Tags + +Tags are arbitrary string key/value pairs for annotating datasets. They follow +the same hierarchy as property mappings: repo-level tags apply to all datasets +in the repository, dataset-level tags apply only to that dataset, and +dataset-level tags override repo-level tags with the same key. + +```yaml +repositories: + BrentLab/harbison_2004: + # Repo-level tags apply to all datasets in this repository + tags: + assay: binding + organism: yeast + dataset: + harbison_2004: + sample_id: + field: sample_id + # Dataset-level tags override repo-level tags with the same key + tags: + assay: chip-chip + + BrentLab/kemmeren_2014: + tags: + assay: perturbation + organism: yeast + dataset: + kemmeren_2014: + sample_id: + field: sample_id +``` + +Access merged tags via `vdb.get_tags(db_name)`, identifying datasets by +their name as it appears in `vdb.tables()`: + +```python +from tfbpapi.virtual_db import VirtualDB + +vdb = VirtualDB("datasets.yaml") + +# Returns {"assay": "chip-chip", "organism": "yeast"} +# (dataset-level assay overrides repo-level) +vdb.get_tags("harbison") + +# Returns {"assay": "perturbation", "organism": "yeast"} +vdb.get_tags("kemmeren") +``` + +The underlying `MetadataConfig` (available as `vdb.config`) exposes the same +data via `(repo_id, config_name)` pairs for programmatic or developer use: + +```python +# Equivalent to 
vdb.get_tags("harbison") above +vdb.config.get_tags("BrentLab/harbison_2004", "harbison_2004") +``` + +## Missing Value Labels + +`missing_value_labels` is a top-level mapping from property name to a default +string value. When a property is listed here, every dataset's `_meta` view will +include that column -- even datasets that have no explicit mapping for it. For +those datasets, the column is emitted as the constant fallback value. + +Datasets that *do* have an explicit mapping for the property are unaffected; they +resolve the value normally (from field definitions, a path, or an expression). + +```yaml +missing_value_labels: + carbon_source: "unspecified" + temperature_celsius: "unspecified" +``` + +**Behavior by dataset**: + +| Dataset | `carbon_source` mapping | `carbon_source` in `_meta` | +|---------|------------------------|---------------------------| +| harbison | `field: condition, path: media.carbon_source.compound` | resolved from DataCard definitions | +| degron | (none) | `'unspecified'` (fallback) | + +Without `missing_value_labels`, datasets that lack the mapping simply do not +include the column in their `_meta` view, making cross-dataset queries on that +column error or require `COALESCE`. + +## Comparative Datasets + +Comparative datasets differ from other dataset types in that they represent +relationships between samples across datasets rather than individual samples. +Each row relates 2+ samples from other datasets. + +### Structure + +Comparative datasets use `source_sample` fields instead of a single sample +identifier column: +- Multiple fields with `role: source_sample` +- Each contains composite identifier: `"repo_id;config_name;sample_id_value"` +- Example: `binding_id = "BrentLab/harbison_2004;harbison_2004;42"` + +### Fields + +All fields in the comparative dataset are included. But they may be re-named +(aliased) by specifically mapping them in the configuration. 
+ +```yaml +dto: + # this would make the displayed field name 'dto_pvalue' + instead of 'empirical_pvalue' + dto_pvalue: + field: empirical_pvalue +``` + +### Link Structure + +the `links` section specifies how the composite IDs map to primary datasets. The first +sub-element under `links` is the name of the field in the comparative dataset that +contains the composite IDs. The value is a list of `[repo_id, config_name]` +pairs indicating which primary datasets are referenced by that field. Those primary +datasets must also be defined in the overall VirtualDB configuration. + +```yaml +# Within the comparative dataset config +dto: + links: + binding_id: + - [BrentLab/harbison_2004, harbison_2004] # [repo_id, config_name] + - [BrentLab/callingcards, annotated_features] + perturbation_id: + - [BrentLab/kemmeren_2014, kemmeren_2014] +``` + +See the [huggingface datacard documentation](huggingface_datacard.md#5-comparative) +for more detailed explanation of comparative datasets and composite IDs. + +## Internal Structure + +VirtualDB uses an in-memory DuckDB database to construct a layered hierarchy +of SQL views over locally cached Parquet files. Views are created on initialization and are not persisted to disk. + +### View Hierarchy + +For each configured dataset, VirtualDB registers a series of views that +build on each other. Using `harbison` as an example primary dataset and +`dto` as a comparative dataset: + +**1. Metadata view** + +One row per unique sample identifier (the column configured via +`sample_id: {field: }`). Derived columns from the +configuration (e.g., `carbon_source`, `temperature_celsius`) are resolved +here using datacard definitions, factor aliases, and missing value labels. +This is the primary view for querying sample-level metadata. + +**2. Raw data view** + +The full parquet data joined to the metadata view so that every row +carries both the raw measurement columns and the derived metadata +columns. 
**Developer note**: There is an internal view called ___parquet that +is just the raw parquet data without any metadata joins or derived columns. +This is used as the base for joining to the metadata view, but is not exposed directly +to users. + +**3. Expanded view (comparative only)** -- `dto_expanded` + +For comparative datasets, each composite ID field (e.g. `binding_id` +with format `"repo_id;config_name;sample_id"`) is parsed into two +additional columns: + +- `_source` -- the `repo_id;config_name` prefix, aliased + to the configured `db_name` when the pair is in the VirtualDB config. + For example, `BrentLab/harbison_2004;harbison_2004` becomes `harbison`. +- `_id` -- the sample_id component. + +This makes it straightforward to join back to primary dataset views +or filter by source dataset without parsing composite IDs in SQL. + +### View Diagram + +``` +__harbison_parquet (raw parquet, not directly exposed) + | + +-> harbison_meta (deduplicated, one row per sample identifier, + | with derived columns from config) + | + +-> harbison (full parquet joined to harbison_meta) + +__dto_parquet (raw parquet, not directly exposed) + | + +-> dto_expanded (parquet + parsed columns: + binding_id_source, binding_id_id, + perturbation_id_source, perturbation_id_id) +``` + +## Usage + +For usage examples and tutorials, +see the [VirtualDB Tutorial](tutorials/virtual_db_tutorial.ipynb). 
\ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index a28f581..42d3bf6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,45 +1,141 @@ site_name: tfbpapi -site_description: "A collection of objects and functions to work with calling cards sequencing tools" +site_description: "Python API for querying and analyzing genomic datasets from HuggingFace Hub" site_author: "ben mueller , chase mateusiak , michael brent " -site_url: "https://brentlab.github.io/tfbpapi/" +site_url: "https://brentlab.github.io/tfbpapi" repo_url: "https://github.com/brentlab/tfbpapi" -repo_name: "tfbpapi" -edit_uri: "edit/master/docs/" +repo_name: "brentlab/tfbpapi" +edit_uri: "edit/main/docs/" watch: ['tfbpapi', 'docs'] theme: name: material + palette: + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.path + - navigation.top + - search.highlight + - search.share + - search.suggest + - content.code.copy + - content.code.select + - content.code.annotate + - content.action.edit + - content.action.view + icon: + repo: fontawesome/brands/github + edit: material/pencil + view: material/eye plugins: -- search -- autorefs -- section-index -- mkdocs-jupyter: + - search: + separator: '[\s\-,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' + - autorefs + - section-index + - mkdocs-jupyter: remove_tag_config: - remove_input_tags: - - hide - remove_output_tags: - - hide -- mkdocstrings: - handlers: - python: - paths: [tfbpapi] # search packages in the src folder - merge_init_into_class: True - options: - docstring_style: 'sphinx' + remove_input_tags: + - hide + 
remove_output_tags: + - hide + execute: false + allow_errors: false + - mkdocstrings: + handlers: + python: + paths: [.] + inventories: + - https://docs.python.org/3/objects.inv + - https://numpy.org/doc/stable/objects.inv + - https://pandas.pydata.org/docs/objects.inv + options: + docstring_style: sphinx + show_source: true + show_root_heading: true + show_root_toc_entry: true + show_symbol_type_heading: true + show_symbol_type_toc: true + signature_crossrefs: true markdown_extensions: + - abbr + - admonition + - attr_list + - def_list + - footnotes + - md_in_html - smarty + - tables - toc: - permalink: True + permalink: true + title: On this page - sane_lists - pymdownx.arithmatex: generic: true + - pymdownx.betterem: + smart_enable: all + - pymdownx.caret + - pymdownx.details + - pymdownx.emoji: + emoji_generator: !!python/name:material.extensions.emoji.to_svg + emoji_index: !!python/name:material.extensions.emoji.twemoji + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.keys + - pymdownx.magiclink: + normalize_issue_symbols: true + repo_url_shorthand: true + user: brentlab + repo: tfbpapi + - pymdownx.mark + - pymdownx.smartsymbols + - pymdownx.snippets: + auto_append: + - includes/mkdocs.md - pymdownx.superfences: custom_fences: - name: mermaid class: mermaid - format: "!!python/name:pymdownx.superfences.fence_code_format" + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.tabbed: + alternate_style: true + combine_header_slug: true + slugify: !!python/object/apply:pymdownx.slugs.slugify + kwds: + case: lower + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.tilde + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/brentlab/tfbpapi + name: GitHub Repository + version: + provider: mike + default: latest extra_javascript: - javascripts/mathjax.js @@ -47,36 +143,28 @@ extra_javascript: - 
https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js - js/init-mermaid.js +extra_css: + - stylesheets/extra.css + nav: -- Home: index.md -- Tutorials: - - Database Interface: tutorials/database_interface.ipynb - - LassoCV: tutorials/lassoCV.ipynb - - Interactor Modeling Workflow: tutorials/interactor_modeling_workflow.ipynb -- API: - - Models: - - Overview: ml_models/index.md - - SigmoidModel: ml_models/SigmoidModel.md - - Lasso Modeling: ml_models/lasso_modeling.md - - Database Interface: - - Records Only Classes: - - interface/BindingManualQCAPI.md - - interface/DataSourceAPI.md - - interface/DtoAPI.md - - interface/ExpressionManualQCAPI.md - - interface/FileFormatAPI.md - - interface/GenomicFeatureAPI.md - - interface/RegulatorAPI.md - - Records and Files Classes: - - BindingAPI: interface/BindingAPI.md - - BindingConcatenatedAPI: interface/BindingConcatenatedAPI.md - - CallingCardsBackgroundAPI: interface/CallingCardsBackgroundAPI.md - - ExpressionAPI: interface/ExpressionAPI.md - - PromoterSetAPI: interface/PromoterSetAPI.md - - PromoterSetSigAPI: interface/PromoterSetSigAPI.md - - Developer Classes: - - interface/AbstractAPI.md - - interface/AbstractRecordsAndFilesAPI.md - - interface/AbstractRecordsOnlyAPI.md - - interface/Cache.md - - interface/ParamsDict.md + - Home: index.md + - Tutorials: + - "Getting Started": + - "DataCard: Exploring Datasets": tutorials/datacard_tutorial.ipynb + - "Cache Management": tutorials/cache_manager_tutorial.ipynb + - "Querying Data": + - "VirtualDB: Unified Cross-Dataset Queries": tutorials/virtual_db_tutorial.ipynb + - API Reference: + - Core: + - VirtualDB: virtual_db.md + - DataCard: datacard.md + - HfCacheManager: hf_cache_manager.md + - Models and Configuration: + - Pydantic Models: models.md + - Fetchers: fetchers.md + - Error Handling: + - Custom Exceptions: errors.md + - HuggingFace Configuration: + - HuggingFace Dataset Card Format: huggingface_datacard.md + - BrentLab Collection: 
brentlab_yeastresources_collection.md + - VirtualDB Configuration: virtual_db_configuration.md diff --git a/pyproject.toml b/pyproject.toml index 969a661..e3710f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,20 +8,26 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.11" -pandas = "^2.2.3" requests = "^2.32.3" -aiohttp = "^3.11.18" -cachetools = "^5.5.2" -scikit-learn = "^1.6.1" -requests-toolbelt = "^1.0.0" -responses = "^0.25.7" -aioresponses = "^0.7.8" +pandas = "^2.3.1" +huggingface-hub = "^0.34.4" +duckdb = "^1.3.2" +pydantic = "^2.11.9" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" -pytest-snapshot = "^0.9.0" pytest-asyncio = "^0.26.0" +types-requests = "^2.32.4.20250809" +mkdocs = "^1.6.1" +mkdocs-material = "^9.6.19" +mkdocs-autorefs = "^1.4.3" +mkdocs-section-index = "^0.3.10" +mkdocs-jupyter = "^0.25.1" +mkdocstrings = {extras = ["python"], version = "^0.30.0"} +matplotlib = "^3.10.6" +seaborn = "^0.13.2" +types-pyyaml = "^6.0.12.20250915" [tool.pytest.ini_options] diff --git a/tfbpapi/AbstractAPI.py b/tfbpapi/AbstractAPI.py deleted file mode 100644 index 19c4eb6..0000000 --- a/tfbpapi/AbstractAPI.py +++ /dev/null @@ -1,230 +0,0 @@ -import logging -import os -from abc import ABC, abstractmethod -from collections.abc import Coroutine -from typing import Any - -import pandas as pd -import requests # type: ignore - -from tfbpapi.Cache import Cache -from tfbpapi.ParamsDict import ParamsDict - - -class AbstractAPI(ABC): - """ - Abstract base class for creating API clients that require token authentication. - - This class provides a template for connecting to a cache for caching API responses, - validating parameters against a list of valid keys, and provides an interface for - CRUD operations. - - """ - - def __init__( - self, - url: str = "", - token: str = "", - **kwargs, - ): - """ - Initialize the API client. - - :param url: The API endpoint URL. Defaults to the `BASE_URL` - environment variable. 
- :param token: The authentication token. Defaults to the `TOKEN` - environment variable. - :param valid_param_keys: A list of valid parameter keys for the API. - :param params: A ParamsDict object containing parameters for the API request. - :param cache: a Cache object for caching API responses. - :param kwargs: Additional keyword arguments that may be passed on to the - ParamsDict and Cache constructors. - - """ - self.logger = logging.getLogger(self.__class__.__name__) - self._token = token or os.getenv("TOKEN", "") - self.url = url or os.getenv("BASE_URL", "") - self.params = ParamsDict( - params=kwargs.pop("params", {}), - valid_keys=kwargs.pop("valid_keys", []), - ) - self.cache = Cache( - maxsize=kwargs.pop("maxsize", 100), ttl=kwargs.pop("ttl", 300) - ) - - @property - def header(self) -> dict[str, str]: - """The HTTP authorization header.""" - return { - "Authorization": f"token {self.token}", - "Content-Type": "application/json", - } - - @property - def url(self) -> str: - """The URL for the API.""" - return self._url # type: ignore - - @url.setter - def url(self, value: str) -> None: - if not value: - self._url = None - elif hasattr(self, "token") and self.token: - # validate the URL with the new token - self._is_valid_url(value) - self._url = value - else: - self.logger.warning("No token provided: URL un-validated") - self._url = value - - @property - def token(self) -> str: - """The authentication token for the API.""" - return self._token - - @token.setter - def token(self, value: str) -> None: - self._token = value - # validate the URL with the new token - if hasattr(self, "url") and self.url: - self.logger.info("Validating URL with new token") - self._is_valid_url(self.url) - - @property - def cache(self) -> Cache: - """The cache object for caching API responses.""" - return self._cache - - @cache.setter - def cache(self, value: Cache) -> None: - self._cache = value - - @property - def params(self) -> ParamsDict: - """The ParamsDict object 
containing parameters for the API request.""" - return self._params - - @params.setter - def params(self, value: ParamsDict) -> None: - self._params = value - - def push_params(self, params: dict[str, Any]) -> None: - """Adds or updates parameters in the ParamsDict.""" - try: - self.params.update(params) - except KeyError as e: - self.logger.error(f"Error updating parameters: {e}") - - def pop_params(self, keys: list[str] | None = None) -> None: - """Removes parameters from the ParamsDict.""" - if keys is None: - self.params.clear() - return - if keys is not None and not isinstance(keys, list): - keys = [keys] - for key in keys: - del self.params[key] - - @abstractmethod - def create(self, data: dict[str, Any], **kwargs) -> Any: - """Placeholder for the create method.""" - raise NotImplementedError( - f"`create()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def read(self, **kwargs) -> Any: - """Placeholder for the read method.""" - raise NotImplementedError( - f"`read()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def update(self, df: pd.DataFrame, **kwargs) -> Any: - """Placeholder for the update method.""" - raise NotImplementedError( - f"`update()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def delete(self, id: str, **kwargs) -> Any: - """Placeholder for the delete method.""" - raise NotImplementedError( - f"`delete()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - """Placeholder for the submit method.""" - raise NotImplementedError( - f"`submit()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Coroutine[Any, Any, Any]: - """Placeholder for the retrieve method.""" - raise NotImplementedError( - f"`retrieve()` is not implemented for 
{self.__class__.__name__}" - ) - - def _is_valid_url(self, url: str) -> None: - """ - Confirms that the URL is valid and the header authorization is appropriate. - - :param url: The URL to validate. - :type url: str - :raises ValueError: If the URL is invalid or the token is not set. - - """ - try: - # note that with allow_redirect=True the result can be a 300 status code - # which is not an error, and then another request to the redirected URL - response = requests.head(url, headers=self.header, allow_redirects=True) - if response.status_code != 200: - raise ValueError("Invalid URL or token provided. Check both.") - except requests.RequestException as e: - raise AttributeError(f"Error validating URL: {e}") from e - except AttributeError as e: - self.logger.error(f"Error validating URL: {e}") - - def _cache_get(self, key: str, default: Any = None) -> Any: - """ - Get a value from the cache if configured. - - :param key: The key to retrieve from the cache. - :type key: str - :param default: The default value to return if the key is not found. - :type default: any, optional - :return: The value from the cache or the default value. - :rtype: any - - """ - return self.cache.get(key, default=default) - - def _cache_set(self, key: str, value: Any) -> None: - """ - Set a value in the cache if configured. - - :param key: The key to set in the cache. - :type key: str - :param value: The value to set in the cache. - :type value: any - - """ - self.cache.set(key, value) - - def _cache_list(self) -> list[str]: - """List keys in the cache if configured.""" - return self.cache.list() - - def _cache_delete(self, key: str) -> None: - """ - Delete a key from the cache if configured. - - :param key: The key to delete from the cache. 
- :type key: str - - """ - self.cache.delete(key) diff --git a/tfbpapi/AbstractRecordsAndFilesAPI.py b/tfbpapi/AbstractRecordsAndFilesAPI.py deleted file mode 100644 index 87f99ad..0000000 --- a/tfbpapi/AbstractRecordsAndFilesAPI.py +++ /dev/null @@ -1,314 +0,0 @@ -import csv -import gzip -import os -import tarfile -import tempfile -from collections.abc import Callable -from io import BytesIO -from typing import Any - -import aiohttp -import pandas as pd - -from tfbpapi.AbstractAPI import AbstractAPI - - -class AbstractRecordsAndFilesAPI(AbstractAPI): - """ - Abstract class to interact with both the records and the data stored in the `file` - field. - - The return for this class must be records, against the `/export` - endpoint when `retrieve_files` is False. When `retrieve_files` is True, the cache - should be checked first. If the file doesn't exist there, it should be retrieved - from the database against the `/record_table_and_files` endpoint. The file should - be a tarball with the metadata.csv and the file associated with the record, - where the file is named according to the `id` field in metadata.csv. Data files - should be `.csv.gz`. - - """ - - def __init__(self, **kwargs): - """ - Initialize the AbstractRecordsAndFilesAPI object. This will serve as an - interface to an endpoint that can serve both records and files, and cache the - file/retrieve from the cache if it exists. - - :param kwargs: parameters to pass to AbstractAPI. 
- - """ - self.export_url_suffix = kwargs.pop("export_url_suffix", "export") - self.export_files_url_suffix = kwargs.pop( - "export_files_url_suffix", "record_table_and_files" - ) - super().__init__(**kwargs) - - @property - def export_url_suffix(self) -> str: - """The URL suffix for exporting records.""" - return self._export_url_suffix - - @export_url_suffix.setter - def export_url_suffix(self, value: str) -> None: - self._export_url_suffix = value - - @property - def export_files_url_suffix(self) -> str: - """The URL suffix for exporting files.""" - return self._export_files_url_suffix - - @export_files_url_suffix.setter - def export_files_url_suffix(self, value: str) -> None: - self._export_files_url_suffix = value - - def _detect_delimiter(self, file_path: str, sample_size: int = 1024) -> str: - """ - Detect the delimiter of a CSV file. - - :param file_path: The path to the CSV file. - :type file_path: str - :param sample_size: The number of bytes to read from the file to detect the - delimiter. Defaults to 1024. - :type sample_size: int - :return: The delimiter of the CSV file. - :rtype: str - :raises FileNotFoundError: If the file does not exist. - :raises gzip.BadGzipFile: If the file is not a valid gzip file. - :raises _csv.Error: If the CSV sniffer cannot determine the delimiter. 
- - """ - try: - # by default, open() uses newline=False, which opens the file - # in universal newline mode and translates all new line characters - # to '\n' - file = ( - gzip.open(file_path, "rt") - if file_path.endswith(".gz") - else open(file_path) - ) - except FileNotFoundError as exc: - raise FileNotFoundError(f"File {file_path} not found.") from exc - - sample = file.read(sample_size) - - # In order to avoid errors in the csv sniffer, attempt to find the - # last newline character in the string - last_newline_index = sample.rfind("\n") - # if a newline character is found, trim the sample to the last newline - if last_newline_index != -1: - # Trim to the last complete line - sample = sample[:last_newline_index] - - sniffer = csv.Sniffer() - dialect = sniffer.sniff(sample) - delimiter = dialect.delimiter - - file.close() - - return delimiter - - async def read( - self, - callback: Callable[ - [pd.DataFrame, dict[str, Any] | None, Any], Any - ] = lambda metadata, data, cache, **kwargs: ( - {"metadata": metadata, "data": data} - ), - retrieve_files: bool = False, - **kwargs, - ) -> Any: - """ - Retrieve data from the endpoint according to the `retrieve_files` parameter. If - `retrieve_files` is False, the records will be returned as a dataframe. If - `retrieve_files` is True, the files associated with the records will be - retrieved either from the local cache or from the database. Note that a user can - select which effect_colname and pvalue_colname is used for a genomicfile (see - database documentation for more details). If one or both of those are present in - the params, and retrieve_file is true, then that column name is added to the - cache_key. Eg if record 1 is being retrieved from mcisaac data with - effect_colname "log2_raio", then the cache_key for that data will be - "1_log2_ratio". The default effect colname, which is set by the database, will - be stored with only the record id as the cache_key. 
- - :param callback: The function to call with the metadata. Signature must - include `metadata`, `data`, and `cache`. - :type callback: Callable[[pd.DataFrame, dict[str, Any] | None, Any], Any] - :param retrieve_files: Boolean. Whether to retrieve the files associated with - the records. Defaults to False. - :type retrieve_files: bool - :param kwargs: The following kwargs are used by the read() function. Any - others are passed onto the callback function - - timeout: The timeout for the GET request. Defaults to 120. - - :return: The result of the callback function. - :rtype: Any - - :raises ValueError: If the callback function does not have the correct - signature. - :raises aiohttp.ClientError: If there is an error in the GET request. - :raises pd.errors.ParserError: If there is an error reading the request - - """ - if not callable(callback) or {"metadata", "data", "cache"} - set( - callback.__code__.co_varnames - ): - raise ValueError( - "The callback must be a callable function with `metadata`, `data`, ", - "and `cache` as parameters.", - ) - - export_url = f"{self.url.rstrip('/')}/{self.export_url_suffix}" - self.logger.debug("read() export_url: %s", export_url) - - timeout = aiohttp.ClientTimeout(kwargs.pop("timeout", 120)) - async with aiohttp.ClientSession(timeout=timeout) as session: - try: - async with session.get( - export_url, headers=self.header, params=self.params - ) as response: - response.raise_for_status() - content = await response.content.read() - with gzip.GzipFile(fileobj=BytesIO(content)) as f: - records_df = pd.read_csv(f) - - if not retrieve_files: - return callback(records_df, None, self.cache, **kwargs) - else: - data_list = await self._retrieve_files(session, records_df) - return callback( - records_df, - data_list, - self.cache, - **kwargs, - ) - - except aiohttp.ClientError as e: - self.logger.error(f"Error in GET request: {e}") - raise - except pd.errors.ParserError as e: - self.logger.error(f"Error reading request content: {e}") - 
raise - - async def _retrieve_files( - self, session: aiohttp.ClientSession, records_df: pd.DataFrame - ) -> dict[str, pd.DataFrame]: - """ - Retrieve files associated with the records either from the local cache or from - the database. - - :param session: The aiohttp ClientSession. - :type session: aiohttp.ClientSession - :param records_df: The DataFrame containing the records. - :type records_df: pd.DataFrame - :return: A dictionary where the keys are record IDs and the values are - DataFrames of the associated files. - :rtype: dict[str, pd.DataFrame] - - """ - data_list = {} - for record_id in records_df["id"]: - data_list[str(record_id)] = await self._retrieve_file(session, record_id) - return data_list - - async def _retrieve_file( - self, session: aiohttp.ClientSession, record_id: int - ) -> pd.DataFrame: - """ - Retrieve a file associated with a record either from the local cache or from the - database. - - :param session: The aiohttp ClientSession. - :type session: aiohttp.ClientSession - :param record_id: The ID of the record. - :type record_id: int - :return: A DataFrame containing the file's data. - :rtype: pd.DataFrame - :raises FileNotFoundError: If the file is not found in the tar archive. - :raises ValueError: If the delimiter is not supported. 
- - """ - export_files_url = f"{self.url.rstrip('/')}/{self.export_files_url_suffix}" - self.logger.debug("_retrieve_file() export_url: %s", export_files_url) - - # set key for local cache - cache_key = str(record_id) - if "effect_colname" in self.params: - cache_key += f"_{self.params['effect_colname']}" - if "pvalue_colname" in self.params: - cache_key += f"_{self.params['pvalue_colname']}" - cached_data = self._cache_get(cache_key) - if cached_data is not None: - self.logger.info(f"cache_key {cache_key} retrieved from cache.") - return pd.read_json(BytesIO(cached_data.encode())) - else: - self.logger.debug(f"cache_key {cache_key} not found in cache.") - - try: - header = self.header.copy() - header["Content-Type"] = "application/gzip" - retrieve_files_params = self.params.copy() - retrieve_files_params.update({"id": record_id}) - async with session.get( - export_files_url, - headers=header, - params=retrieve_files_params, - timeout=120, - ) as response: - response.raise_for_status() - tar_data = await response.read() - - # Create a temporary file for the tarball - tar_file = tempfile.NamedTemporaryFile(delete=False, suffix=".tar.gz") - try: - tar_file.write(tar_data) - tar_file.flush() - tar_file.seek(0) - - # Create a temporary directory for extraction - with tempfile.TemporaryDirectory() as extract_dir: - # Open the tar file and log its contents - with tarfile.open(fileobj=tar_file, mode="r:gz") as tar: - tar_members = tar.getmembers() - self.logger.debug( - f"Tar file contains: " - f"{[member.name for member in tar_members]}", - ) - - # Find the specific file to extract - csv_filename = f"{record_id}.csv.gz" - member = next( - (m for m in tar_members if m.name == csv_filename), None - ) - if member is None: - raise FileNotFoundError( - f"{csv_filename} not found in tar archive" - ) - - # Extract only the specific member - tar.extract(member, path=extract_dir) - - # Read the extracted CSV file - csv_path = os.path.join(extract_dir, csv_filename) - 
self.logger.debug(f"Extracted file: {csv_path}") - - delimiter = self._detect_delimiter(csv_path) - - # raise an error if the delimiter is not a "," or a "\t" - if delimiter not in [",", "\t"]: - raise ValueError( - f"Delimiter {delimiter} is not supported. " - "Supported delimiters are ',' and '\\t'." - ) - - df = pd.read_csv(csv_path, delimiter=delimiter) - - # Store the data in the cache - self.logger.debug(f"Storing {cache_key} in cache.") - self._cache_set(cache_key, df.to_json()) - finally: - os.unlink(tar_file.name) - - return df - except Exception as e: - self.logger.error(f"Error retrieving file for cache_key {cache_key}: {e}") - raise diff --git a/tfbpapi/AbstractRecordsOnlyAPI.py b/tfbpapi/AbstractRecordsOnlyAPI.py deleted file mode 100644 index 1751ec7..0000000 --- a/tfbpapi/AbstractRecordsOnlyAPI.py +++ /dev/null @@ -1,82 +0,0 @@ -import gzip -import logging -from collections.abc import Callable -from io import BytesIO -from typing import Any - -import aiohttp -import pandas as pd - -from tfbpapi.AbstractAPI import AbstractAPI - - -class AbstractRecordsOnlyAPI(AbstractAPI): - """Abstract class for CRUD operations on records-only (no file storage) - endpoints.""" - - def __init__(self, **kwargs): - """ - Initialize the RecordsOnlyAPI object. - - :param kwargs: Additional parameters to pass to AbstractAPI. - - """ - self.logger = logging.getLogger(__name__) - super().__init__(**kwargs) - - async def read( - self, - callback: Callable[ - [pd.DataFrame, dict[str, Any] | None, Any], Any - ] = lambda metadata, data, cache, **kwargs: { - "metadata": metadata, - "data": data, - }, - export_url_suffix="export", - **kwargs, - ) -> Any: - """ - Retrieve data from the endpoint. The data will be returned as a dataframe. The - callback function must take metadata, data, and cache as parameters. - - :param callback: The function to call with the data. Signature must - include `metadata`, `data`, and `cache` as parameters. 
- :param export_url_suffix: The URL suffix for the export endpoint. This will - return a response object with a csv file. - :param kwargs: This can be used to pass "params" to the request to use in place - of `self.params`. If those are passed, they will be popped off and then - the remaining kwargs will be passed to the callback function - - """ - if not callable(callback) or {"metadata", "data", "cache"} - set( - callback.__code__.co_varnames - ): - raise ValueError( - "The callback must be a callable function with `metadata`,", - "`data`, and `cache` as parameters.", - ) - - export_url = f"{self.url.rstrip('/')}/{export_url_suffix}" - self.logger.debug("read() export_url: %s", export_url) - - async with aiohttp.ClientSession() as session: - try: - # note that the url and the export suffix are joined such that - # the url is stripped of any trailing slashes and the export suffix is - # added without a leading slash - async with session.get( - export_url, - headers=self.header, - params=kwargs.pop("params", self.params), - ) as response: - response.raise_for_status() - content = await response.content.read() - with gzip.GzipFile(fileobj=BytesIO(content)) as f: - records_df = pd.read_csv(f) - return callback(records_df, None, self.cache, **kwargs) - except aiohttp.ClientError as e: - self.logger.error(f"Error in GET request: {e}") - raise - except pd.errors.ParserError as e: - self.logger.error(f"Error reading request content: {e}") - raise diff --git a/tfbpapi/BindingAPI.py b/tfbpapi/BindingAPI.py deleted file mode 100644 index 8d58b51..0000000 --- a/tfbpapi/BindingAPI.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class BindingAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the BindingAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the BindingAPI object. 
- - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "source", - "source_orig_id", - "strain", - "condition", - "lab", - "assay", - "workflow", - "data_usable", - ], - ) - - url = kwargs.pop("url", os.getenv("BINDING_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The BindingAPI does not support retrieve.") diff --git a/tfbpapi/BindingConcatenatedAPI.py b/tfbpapi/BindingConcatenatedAPI.py deleted file mode 100644 index 1ad6aff..0000000 --- a/tfbpapi/BindingConcatenatedAPI.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class BindingConcatenatedAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the BindingConcatenatedAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the BindingConcatenatedAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "source", - "strain", - "condition", - "lab", - "assay", - "workflow", - "data_usable", - ], - ) - - url = kwargs.pop("url", os.getenv("BINDINGCONCATENATED_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError( - "The BindingConcatenatedAPI does not support retrieve." - ) diff --git a/tfbpapi/BindingManualQCAPI.py b/tfbpapi/BindingManualQCAPI.py deleted file mode 100644 index df4169b..0000000 --- a/tfbpapi/BindingManualQCAPI.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -from typing import Any - -import pandas as pd -import requests # type: ignore - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class BindingManualQCAPI(AbstractRecordsOnlyAPI): - """A class to interact with the BindingManualQCAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the BindingManualQCAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "binding", - "best_datatype", - "data_usable", - "passing_replicate", - "rank_recall", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "source", - ], - ) - - url = kwargs.pop("url", os.getenv("BINDINGMANUALQC_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`BINDINGMANUALQC_URL` must be set", - ) - - self.bulk_update_url_suffix = kwargs.pop( - "bulk_update_url_suffix", "bulk-update" - ) - - super().__init__(url=url, valid_param_keys=valid_param_keys, **kwargs) - - @property - def bulk_update_url_suffix(self) -> str: - """The URL suffix for updating multiple records in the same request.""" - return self._bulk_update_url_suffix - - @bulk_update_url_suffix.setter - def bulk_update_url_suffix(self, value: str) -> None: - self._bulk_update_url_suffix = value - - def update(self, df: pd.DataFrame, **kwargs: Any) -> requests.Response: - """ - Update the records in the database. - - :param df: The DataFrame containing the records to update. - :type df: pd.DataFrame - :param kwargs: Additional fields to include in the payload. - :type kwargs: Any - :return: The response from the POST request. - :rtype: requests.Response - :raises requests.RequestException: If the request fails. 
- - """ - bulk_update_url = ( - f"{self.url.rstrip('/')}/{self.bulk_update_url_suffix.rstrip('/')}/" - ) - - self.logger.debug("bulk_update_url: %s", bulk_update_url) - - # Include additional fields in the payload if provided - payload = {"data": df.to_dict(orient="records")} - payload.update(kwargs) - - try: - response = requests.post( - bulk_update_url, - headers=self.header, - json=payload, - ) - response.raise_for_status() - return response - except requests.RequestException as e: - self.logger.error(f"Error in POST request: {e}") - raise - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support create.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support retrieve.") diff --git a/tfbpapi/Cache.py b/tfbpapi/Cache.py deleted file mode 100644 index 366604d..0000000 --- a/tfbpapi/Cache.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging -from typing import Any - -from cachetools import TTLCache # type: ignore - - -class Cache: - """A caching class that uses cachetools for TTL caching with an LRU eviction - policy.""" - - def __init__(self, maxsize: int = 100, ttl: int = 300): - self.ttl_cache = TTLCache(maxsize=maxsize, ttl=ttl) - self.logger = logging.getLogger(__name__) - - def get(self, key: str, default: Any = None) -> Any: - """Get a value from the cache.""" - return self.ttl_cache.get(key, default) - - def set(self, key: str, value: Any) -> None: - """Set a value in the cache.""" - self.ttl_cache[key] = value - - def list(self) -> list[str]: - """List all keys in the cache.""" - return 
list(self.ttl_cache.keys()) - - def delete(self, key: str) -> None: - """Delete a key from the cache.""" - self.ttl_cache.pop(key, None) diff --git a/tfbpapi/CallingCardsBackgroundAPI.py b/tfbpapi/CallingCardsBackgroundAPI.py deleted file mode 100644 index f5b7668..0000000 --- a/tfbpapi/CallingCardsBackgroundAPI.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class CallingCardsBackgroundAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the CallingCardsBackgroundAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the CallingCardsBackgroundAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - ["id", "name"], - ) - - url = kwargs.pop("url", os.getenv("CALLINGCARDSBACKGROUND_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support create." - ) - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support update." - ) - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support delete." - ) - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support submit." - ) - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support retrieve." 
- ) diff --git a/tfbpapi/DataSourceAPI.py b/tfbpapi/DataSourceAPI.py deleted file mode 100644 index 0d00785..0000000 --- a/tfbpapi/DataSourceAPI.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class DataSourceAPI(AbstractRecordsOnlyAPI): - """A class to interact with the DataSourceAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the DataSourceAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - ["id", "fileformat_id", "fileformat", "lab", "assay", "workflow"], - ) - - url = kwargs.pop("url", os.getenv("DATASOURCE_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`DATASOURCE_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The DataSourceAPI does not support retrieve.") diff --git a/tfbpapi/DtoAPI.py b/tfbpapi/DtoAPI.py deleted file mode 100644 index bc8d404..0000000 --- a/tfbpapi/DtoAPI.py +++ /dev/null @@ -1,295 +0,0 @@ -import asyncio -import json -import os -import time -from typing import Any - -import aiohttp -import pandas as pd -import requests # type: ignore - -from 
tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class DtoAPI(AbstractRecordsOnlyAPI): - """ - A class to interact with the DTO API. - - Retrieves dto data from the database. - - """ - - def __init__(self, **kwargs) -> None: - """ - Initialize the DTO object. This will serve as an interface to the DTO endpoint - of both the database and the application cache. - - :param url: The URL of the DTO API - :param kwargs: Additional parameters to pass to AbstractAPI. - - """ - - self.bulk_update_url_suffix = kwargs.pop( - "bulk_update_url_suffix", "bulk-update" - ) - - super().__init__( - url=kwargs.pop("url", os.getenv("DTO_URL", "")), - **kwargs, - ) - - async def read(self, *args, **kwargs) -> Any: - """ - Override the read() method to use a custom callback that parses metadata. - - :param callback: The function to call with the metadata. Defaults to parsing - metadata. - :type callback: Callable[[pd.DataFrame, dict[str, Any] | None, Any], Any] - :return: The result of the callback function. - :rtype: Any - - """ - - # Define the default callback - def dto_callback(metadata, data, cache, **kwargs): - return {"metadata": self.parse_metadata(metadata), "data": data} - - # Explicitly set the callback argument to dto_callback - kwargs["callback"] = dto_callback - - # Call the superclass method with updated kwargs - return await super().read(*args, **kwargs) - - async def submit( - self, - post_dict: dict[str, Any], - **kwargs, - ) -> Any: - """ - Submit a DTO task to the DTO API. - - :param post_dict: The dictionary to submit to the DTO API. The typing needs to - be adjusted -- it can take a list of dictionaries to submit a batch. - :return: The group_task_id of the submitted task. 
- - """ - # make a post request with the post_dict to dto_url - dto_url = f"{self.url.rstrip('/')}/submit/" - self.logger.debug("dto_url: %s", dto_url) - - async with aiohttp.ClientSession() as session: - async with session.post( - dto_url, headers=self.header, json=post_dict - ) as response: - try: - response.raise_for_status() - except aiohttp.ClientResponseError as e: - self.logger.error( - "Failed to submit DTO task: Status %s, Reason %s", - e.status, - e.message, - ) - raise - result = await response.json() - try: - return result["group_task_id"] - except KeyError: - self.logger.error( - "Expected 'group_task_id' in response: %s", json.dumps(result) - ) - raise - - async def retrieve( - self, - group_task_id: str, - timeout: int = 300, - polling_interval: int = 2, - **kwargs, - ) -> dict[str, pd.DataFrame]: - """ - Periodically check the task status and retrieve the result when the task - completes. - - :param group_task_id: The task ID to retrieve results for. - :param timeout: The maximum time to wait for the task to complete (in seconds). - :param polling_interval: The time to wait between status checks (in seconds). - :return: Records from the DTO API of the successfully completed task. 
- - """ - # Start time for timeout check - start_time = time.time() - - # Task status URL - status_url = f"{self.url.rstrip('/')}/status/" - - while True: - async with aiohttp.ClientSession() as session: - # Send a GET request to check the task status - async with session.get( - status_url, - headers=self.header, - params={"group_task_id": group_task_id}, - ) as response: - response.raise_for_status() # Raise an error for bad status codes - status_response = await response.json() - - # Check if the task is complete - if status_response.get("status") == "SUCCESS": - - if error_tasks := status_response.get("error_tasks"): - self.logger.error( - f"Tasks {group_task_id} failed: {error_tasks}" - ) - if success_tasks := status_response.get("success_pks"): - params = {"id": ",".join(str(pk) for pk in success_tasks)} - return await self.read(params=params) - elif status_response.get("status") == "FAILURE": - raise Exception( - f"Task {group_task_id} failed: {status_response}" - ) - - # Check if we have reached the timeout - elapsed_time = time.time() - start_time - if elapsed_time > timeout: - raise TimeoutError( - f"Task {group_task_id} did not " - "complete within {timeout} seconds." - ) - - # Wait for the specified polling interval before checking again - await asyncio.sleep(polling_interval) - - def create(self, data: dict[str, Any], **kwargs) -> requests.Response: - raise NotImplementedError("The DTO does not support create.") - - def update(self, df: pd.DataFrame, **kwargs: Any) -> requests.Response: - """ - Update the records in the database. - - :param df: The DataFrame containing the records to update. - :type df: pd.DataFrame - :param kwargs: Additional fields to include in the payload. - :type kwargs: Any - :return: The response from the POST request. - :rtype: requests.Response - :raises requests.RequestException: If the request fails. 
- - """ - bulk_update_url = ( - f"{self.url.rstrip('/')}/{self.bulk_update_url_suffix.rstrip('/')}/" - ) - - self.logger.debug("bulk_update_url: %s", bulk_update_url) - - # Include additional fields in the payload if provided - payload = {"data": df.to_dict(orient="records")} - payload.update(kwargs) - - try: - response = requests.post( - bulk_update_url, - headers=self.header, - json=payload, - ) - response.raise_for_status() - return response - except requests.RequestException as e: - self.logger.error(f"Error in POST request: {e}") - raise - - def delete(self, id: str, **kwargs) -> Any: - """ - Delete a DTO record from the database. - - :param id: The ID of the DTO record to delete. - :return: A dictionary with a status message indicating success or failure. - - """ - # Include the Authorization header with the token - headers = kwargs.get("headers", {}) - headers["Authorization"] = f"Token {self.token}" - - # Make the DELETE request with the updated headers - response = requests.delete(f"{self.url}/{id}/", headers=headers, **kwargs) - - if response.status_code == 204: - return {"status": "success", "message": "DTO deleted successfully."} - - # Raise an error if the response indicates failure - response.raise_for_status() - - def parse_metadata(self, metadata: pd.DataFrame) -> pd.DataFrame: - """ - Parse the metadata from the DTO API. - - :param metadata: The metadata DataFrame to parse. - :return: The parsed metadata DataFrame. - :raises KeyError: If the metadata DataFrame is missing required columns. 
- - """ - if metadata.empty: - self.logger.warning("Metadata is empty") - return metadata - - output_columns = [ - "id", - "promotersetsig", - "expression", - "regulator_symbol", - "binding_source", - "expression_source", - "passing_fdr", - "passing_pvalue", - ] - - # required columns are "result" and output_columns - missing_req_columns = [ - col for col in ["result"] + output_columns if col not in metadata.columns - ] - if missing_req_columns: - raise KeyError( - "Metadata is missing required columns: " - "{', '.join(missing_req_columns)}" - ) - - dto_results_list = [] - - # Check and rename keys, logging a warning if a key is missing - keys_to_rename = { - "rank1": "binding_rank_threshold", - "rank2": "perturbation_rank_threshold", - "set1_len": "binding_set_size", - "set2_len": "perturbation_set_size", - } - - for _, row in metadata.iterrows(): - dto_results = json.loads(row.result.replace("'", '"')) - - for old_key, new_key in keys_to_rename.items(): - if old_key in dto_results: - dto_results[new_key] = dto_results.pop(old_key) - else: - self.logger.warning( - f"Key '{old_key}' missing in row with id '{row.id}'." 
- ) - - dto_results["id"] = row.id - dto_results["promotersetsig"] = row.promotersetsig - dto_results["expression"] = row.expression - dto_results["regulator_symbol"] = row.regulator_symbol - dto_results["binding_source"] = row.binding_source - dto_results["expression_source"] = row.expression_source - dto_results["passing_fdr"] = row.passing_fdr - dto_results["passing_pvalue"] = row.passing_pvalue - - dto_results_list.append(dto_results) - - # Create DataFrame - result_df = pd.DataFrame(dto_results_list) - - # Reorder columns: output_columns first, followed by others - reordered_columns = output_columns + [ - col for col in result_df.columns if col not in output_columns - ] - - return result_df.loc[:, reordered_columns] diff --git a/tfbpapi/ExpressionAPI.py b/tfbpapi/ExpressionAPI.py deleted file mode 100644 index c61e1f7..0000000 --- a/tfbpapi/ExpressionAPI.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class ExpressionAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the ExpressionAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the ExpressionAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "control", - "mechanism", - "restriction", - "time", - "strain", - "source", - "source_name", - "source_time", - "lab", - "assay", - "workflow", - "effect_colname", - "pvalue_colname", - "preferred_replicate", - ], - ) - - url = kwargs.pop("url", os.getenv("EXPRESSION_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The ExpressionAPI does not support retrieve.") diff --git a/tfbpapi/ExpressionManualQCAPI.py b/tfbpapi/ExpressionManualQCAPI.py deleted file mode 100644 index 80023e6..0000000 --- a/tfbpapi/ExpressionManualQCAPI.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -from typing import Any - -import pandas as pd -import requests # type: ignore - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class ExpressionManualQCAPI(AbstractRecordsOnlyAPI): - """A class to interact with the ExpressionManualQCAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the ExpressionManualQCAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "expression", - "strain_verified", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "control", - "mechanism", - "restriction", - "time", - "source", - "lab", - "assay", - "workflow", - ], - ) - - url = kwargs.pop("url", os.getenv("EXPRESSIONMANUALQC_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`EXPRESSIONMANUALQC_URL` must be set", - ) - - self.bulk_update_url_suffix = kwargs.pop( - "bulk_update_url_suffix", "bulk-update" - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionManualQCAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs: Any) -> requests.Response: - """ - Update the records in the database. - - :param df: The DataFrame containing the records to update. - :type df: pd.DataFrame - :param kwargs: Additional fields to include in the payload. - :type kwargs: Any - :return: The response from the POST request. - :rtype: requests.Response - :raises requests.RequestException: If the request fails. 
- - """ - bulk_update_url = ( - f"{self.url.rstrip('/')}/{self.bulk_update_url_suffix.rstrip('/')}/" - ) - - self.logger.debug("bulk_update_url: %s", bulk_update_url) - - # Include additional fields in the payload if provided - payload = {"data": df.to_dict(orient="records")} - payload.update(kwargs) - - try: - response = requests.post( - bulk_update_url, - headers=self.header, - json=payload, - ) - response.raise_for_status() - return response - except requests.RequestException as e: - self.logger.error(f"Error in POST request: {e}") - raise - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The ExpressionManualQCAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionManualQCAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError( - "The ExpressionManualQCAPI does not support retrieve." - ) diff --git a/tfbpapi/FileFormatAPI.py b/tfbpapi/FileFormatAPI.py deleted file mode 100644 index bccdcc1..0000000 --- a/tfbpapi/FileFormatAPI.py +++ /dev/null @@ -1,57 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class FileFormatAPI(AbstractRecordsOnlyAPI): - """A class to interact with the FileFormatAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the FileFormatAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "fileformat", - "fields", - "separator", - "feature_identifier_col", - "effect_col", - "default_effect_threshold", - "pval_col", - "default_pvalue_threshold", - ], - ) - - url = kwargs.pop("url", os.getenv("FILEFORMAT_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`FILEFORMAT_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The FileFormatAPI does not support retrieve.") diff --git a/tfbpapi/GenomicFeatureAPI.py b/tfbpapi/GenomicFeatureAPI.py deleted file mode 100644 index 499cb6c..0000000 --- a/tfbpapi/GenomicFeatureAPI.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class GenomicFeatureAPI(AbstractRecordsOnlyAPI): - """A class to interact with the GenomicFeatureAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the GenomicFeatureAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "chr", - "start", - "end", - "strand", - "type", - "locus_tag", - "symbol", - "source", - "alias", - "note", - ], - ) - - url = kwargs.pop("url", os.getenv("GENOMICFEATURE_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`GENOMICFEATURE_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support retrieve.") diff --git a/tfbpapi/ParamsDict.py b/tfbpapi/ParamsDict.py deleted file mode 100644 index 19f7470..0000000 --- a/tfbpapi/ParamsDict.py +++ /dev/null @@ -1,156 +0,0 @@ -from typing import Any, Union - - -class ParamsDict(dict): - """ - A dictionary subclass that ensures all keys are strings and supports multiple key- - value assignments at once, with validation against a list of valid keys. - - This class is designed to be used for passing parameters to HTTP requests and - extends the base dictionary class, ensuring that insertion order is preserved. - - """ - - def __init__(self, params: dict[str, Any] = {}, valid_keys: list[str] = []) -> None: - """ - Initialize the ParamsDict with optional initial parameters and valid keys. - - :param params: A dictionary of initial parameters. 
All keys must be strings. - :type params: dict, optional - :param valid_keys: A list of valid keys for validation. - :type valid_keys: list of str, optional - :raises ValueError: If `params` is not a dictionary or if any of the keys - are not strings. - - """ - params = params or {} - valid_keys = valid_keys or [] - if not isinstance(params, dict): - raise ValueError("params must be a dictionary") - if len(params) > 0 and not all(isinstance(k, str) for k in params.keys()): - raise ValueError("params must be a dictionary with string keys") - super().__init__(params) - self._valid_keys = valid_keys - - def __setitem__(self, key: str | list[str], value: Any | list[Any]) -> None: - """ - Set a parameter value or multiple parameter values. - - :param key: The parameter key or a list of parameter keys. - :type key: str or list of str - :param value: The parameter value or a list of parameter values. - :type value: any or list of any - :raises ValueError: If the length of `key` and `value` lists do not match. - :raises KeyError: If `key` is not a string or a list of strings. - - """ - if isinstance(key, str): - self._validate_key(key) - super().__setitem__(key, value) - elif isinstance(key, list) and isinstance(value, list): - if len(key) != len(value): - raise ValueError("Length of keys and values must match") - for k, v in zip(key, value): - if not isinstance(k, str): - raise KeyError("All keys must be strings") - self._validate_key(k) - super().__setitem__(k, v) - else: - raise KeyError("Key must be a string or list of strings") - - def __getitem__(self, key: str | list[str]) -> Union[Any, "ParamsDict"]: - """ - Get a parameter value or a new ParamsDict with specified keys. - - :param key: The parameter key or a list of parameter keys. - :type key: str or list of str - :return: The parameter value or a new ParamsDict with the specified keys. - :rtype: any or ParamsDict - :raises KeyError: If `key` is not a string or a list of strings. 
- - """ - if isinstance(key, str): - return super().__getitem__(key) - elif isinstance(key, list): - return ParamsDict({k: dict.__getitem__(self, k) for k in key if k in self}) - else: - raise KeyError("Key must be a string or list of strings") - - def __delitem__(self, key: str) -> None: - """ - Delete a parameter by key. - - :param key: The parameter key. - :type key: str - :raises KeyError: If `key` is not a string. - - """ - if isinstance(key, str): - super().__delitem__(key) - else: - raise KeyError("Key must be a string") - - def __repr__(self) -> str: - """ - Return a string representation of the ParamsDict. - - :return: A string representation of the ParamsDict. - :rtype: str - - """ - return f"ParamsDict({super().__repr__()})" - - def __str__(self) -> str: - """ - Return a human-readable string representation of the ParamsDict. - - :return: A human-readable string representation of the ParamsDict. - :rtype: str - - """ - return ", ".join(f"{k}: {v}" for k, v in self.items()) - - def update(self, *args, **kwargs) -> None: - """Update the ParamsDict with the key/value pairs from other, overwriting - existing keys.""" - if args: - other = args[0] - if isinstance(other, dict): - [self._validate_key(k) for k in other.keys()] - for key, value in other.items(): - self.__setitem__(key, value) - else: - [self._validate_key(k) for k, _ in other] - for key, value in other: - self.__setitem__(key, value) - [self._validate_key(k) for k in kwargs.keys()] - for key, value in kwargs.items(): - self.__setitem__(key, value) - - def as_dict(self) -> dict: - """ - Convert the ParamsDict to a standard dictionary. - - :return: A standard dictionary with the same items as the ParamsDict. 
- :rtype: dict - - """ - return dict(self) - - def _validate_key(self, key: str) -> bool: - """Validate that the key is in the list of valid keys.""" - if self._valid_keys and key not in self._valid_keys: - raise KeyError(f"Invalid parameter key provided: {key}") - return True - - @property - def valid_keys(self) -> list[str]: - """Get the list of valid keys.""" - return self._valid_keys - - @valid_keys.setter - def valid_keys(self, keys: list[str]) -> None: - """Set the list of valid keys.""" - if not all(isinstance(k, str) for k in keys): - raise ValueError("valid_keys must be a list of strings") - self._valid_keys = keys diff --git a/tfbpapi/PromoterSetAPI.py b/tfbpapi/PromoterSetAPI.py deleted file mode 100644 index f747497..0000000 --- a/tfbpapi/PromoterSetAPI.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class PromoterSetAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the PromoterSetAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the PromoterSetAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - ["id", "name"], - ) - - url = kwargs.pop("url", os.getenv("PROMOTERSET_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support retrieve.") diff --git a/tfbpapi/PromoterSetSigAPI.py b/tfbpapi/PromoterSetSigAPI.py deleted file mode 100644 index 0110a8c..0000000 --- a/tfbpapi/PromoterSetSigAPI.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class PromoterSetSigAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the PromoterSetSigAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the PromoterSetSigAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "binding", - "promoter", - "promoter_name", - "background", - "background_name", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "source", - "source_name", - "lab", - "assay", - "workflow", - "data_usable", - "aggregated", - "condition", - "deduplicate", - "preferred_replicate", - ], - ) - - url = kwargs.pop("url", os.getenv("PROMOTERSETSIG_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support retrieve.") diff --git a/tfbpapi/RankResponseAPI.py b/tfbpapi/RankResponseAPI.py deleted file mode 100644 index 6ed3330..0000000 --- a/tfbpapi/RankResponseAPI.py +++ /dev/null @@ -1,286 +0,0 @@ -import asyncio -import json -import os -import tarfile -import tempfile -import time -from typing import Any - -import aiohttp -import pandas as pd -from requests import Response, delete, post # type: ignore -from requests_toolbelt import MultipartEncoder - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class RankResponseAPI(AbstractRecordsAndFilesAPI): - """ - A class to interact with the Rank Response API. - - Retrieves rank response data from the database. 
- - """ - - def __init__(self, **kwargs) -> None: - """ - Initialize the RankResponseAPI object. This will serve as an interface to the - RankResponse endpoint of both the database and the application cache. - - :param url: The URL of the Rank Response API - :param kwargs: Additional parameters to pass to AbstractAPI. - - """ - super().__init__( - url=kwargs.pop("url", os.getenv("RANKRESPONSE_URL", "")), - **kwargs, - ) - - async def submit( - self, - post_dict: dict[str, Any], - **kwargs, - ) -> Any: - # make a post request with the post_dict to rankresponse_url - rankresponse_url = f"{self.url.rstrip('/')}/submit/" - self.logger.debug("rankresponse_url: %s", rankresponse_url) - - async with aiohttp.ClientSession() as session: - async with session.post( - rankresponse_url, headers=self.header, json=post_dict - ) as response: - response.raise_for_status() - result = await response.json() - try: - return result["group_task_id"] - except KeyError: - self.logger.error( - "Expected 'group_task_id' in response: %s", json.dumps(result) - ) - raise - - async def retrieve( - self, - group_task_id: str, - timeout: int = 300, - polling_interval: int = 2, - **kwargs, - ) -> dict[str, pd.DataFrame]: - """ - Periodically check the task status and retrieve the result when the task - completes. - - :param group_task_id: The task ID to retrieve results for. - :param timeout: The maximum time to wait for the task to complete (in seconds). - :param polling_interval: The time to wait between status checks (in seconds). - :return: Extracted files from the result tarball. 
- - """ - # Start time for timeout check - start_time = time.time() - - # Task status URL - status_url = f"{self.url.rstrip('/')}/status/" - - while True: - async with aiohttp.ClientSession() as session: - # Send a GET request to check the task status - async with session.get( - status_url, - headers=self.header, - params={"group_task_id": group_task_id}, - ) as response: - response.raise_for_status() # Raise an error for bad status codes - status_response = await response.json() - - # Check if the task is complete - if status_response.get("status") == "SUCCESS": - # Fetch and return the tarball - return await self._download_result(group_task_id) - elif status_response.get("status") == "FAILURE": - raise Exception( - f"Task {group_task_id} failed: {status_response}" - ) - - # Check if we have reached the timeout - elapsed_time = time.time() - start_time - if elapsed_time > timeout: - raise TimeoutError( - f"Task {group_task_id} did not " - "complete within {timeout} seconds." - ) - - # Wait for the specified polling interval before checking again - await asyncio.sleep(polling_interval) - - async def _download_result(self, group_task_id: str) -> Any: - """ - Download the result tarball after the task is successful. - - :param group_task_id: The group_task_id to download the results for. - :return: Extracted metadata and data from the tarball. 
- - """ - download_url = f"{self.url.rstrip('/')}/retrieve_task/" - - async with aiohttp.ClientSession() as session: - async with session.get( - download_url, - headers=self.header, - params={"group_task_id": group_task_id}, - ) as response: - response.raise_for_status() # Ensure request was successful - tar_data = await response.read() - - # Save tarball to a temporary file or return raw tar content - with tempfile.NamedTemporaryFile( - delete=False, suffix=".tar.gz" - ) as temp_file: - temp_file.write(tar_data) - temp_file.flush() - temp_file.seek(0) - - # Extract and return the content of the tarball - return self._extract_files(temp_file.name) - - def _extract_files(self, tar_path: str) -> dict[str, pd.DataFrame]: - """ - Extract metadata and associated files from a tarball. - - :param tar_path: The path to the tarball file. - :return: A tuple of metadata DataFrame and a dictionary of DataFrames for each - file. - - """ - with tarfile.open(tar_path, mode="r:gz") as tar: - tar_members = tar.getmembers() - - # Extract metadata.json - metadata_member = next( - (m for m in tar_members if m.name == "metadata.json"), None - ) - if metadata_member is None: - raise FileNotFoundError("metadata.json not found in tar archive") - - extracted_file = tar.extractfile(metadata_member) - if extracted_file is None: - raise FileNotFoundError("Failed to extract metadata.json") - - with extracted_file as f: - metadata_dict = json.load(f) - - metadata_df = pd.DataFrame(metadata_dict.values()) - metadata_df["id"] = metadata_dict.keys() - - # Extract CSV files - data = {} - for rr_id in metadata_df["id"]: - csv_filename = f"{rr_id}.csv.gz" - member = next((m for m in tar_members if m.name == csv_filename), None) - if member is None: - raise FileNotFoundError(f"{csv_filename} not found in tar archive") - - extracted_file = tar.extractfile(member) - if extracted_file is None: - raise FileNotFoundError(f"Failed to extract {csv_filename}") - - with extracted_file as f: - data[rr_id] = 
pd.read_csv(f, compression="gzip") - return {"metadata": metadata_df, "data": data} - - def create(self, data: dict[str, Any], **kwargs) -> Response: - """ - Create a new RankResponse record by uploading a gzipped CSV file. - - :param data: This should be the fields in the RankREsponse model, eg - "promotersetsig_id", "expression_id" and "parameters". - :param kwargs: Additional parameters to pass to the post. This must include a - DataFrame to upload as a CSV file with the keyword `df`, eg `df=my_df`. - - :return: The result of the post request. - - :raises ValueError: If a DataFrame is not provided in the keyword arguments. - :raises TypeError: If the DataFrame provided is not a pandas DataFrame. - - """ - # ensure that the url ends in a slash - rankresponse_url = f"{self.url.rstrip('/')}/" - df = kwargs.pop("df", None) - - if df is None: - raise ValueError( - "A DataFrame must be provided to create " - "a RankResponse via keyword `df`" - ) - if not isinstance(df, pd.DataFrame): - raise TypeError( - f"Expected a DataFrame for keyword `df`, got {type(df).__name__}" - ) - - # Create a temporary gzipped CSV file from the DataFrame - with tempfile.NamedTemporaryFile(suffix=".csv.gz") as temp_file: - df.to_csv(temp_file.name, compression="gzip", index=False) - - # Prepare the file and metadata for upload - with open(temp_file.name, "rb") as file: - multipart_data = MultipartEncoder( - fields={**data, "file": (temp_file.name, file, "application/gzip")} - ) - headers = {**self.header, "Content-Type": multipart_data.content_type} - - # Send the POST request with custom encoded multipart data - response = post(rankresponse_url, headers=headers, data=multipart_data) - - response.raise_for_status() - return response - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The RankResponseAPI does not support update.") - - def delete(self, id: str = "", **kwargs) -> Any: - """ - Delete one or more records from the database. 
- - :param id: The ID of the record to delete. However, you can also pass in - `ids` as a list of IDs to delete multiple records. This is why `id` is optional. - If neither `id` nor `ids` is provided, a ValueError is raised. - - :return: A dictionary with a status message indicating success or failure. - - :raises ValueError: If neither `id` nor `ids` is provided. - - """ - # Include the Authorization header with the token - headers = kwargs.get("headers", {}) - headers["Authorization"] = f"Token {self.token}" - - ids = kwargs.pop("ids", str(id)) - - # Determine if it's a single ID or multiple - if isinstance(ids, str) and str != "": - # Single ID deletion for backward compatibility - response = delete(f"{self.url}/{ids}/", headers=headers, **kwargs) - elif isinstance(ids, list) and ids: - # Bulk delete with a list of IDs - response = delete( - f"{self.url}/delete/", - headers=headers, - json={"ids": ids}, # Send the list of IDs in the request body - **kwargs, - ) - else: - raise ValueError( - "No ID(s) provided for deletion. Either pass a single ID with " - "`id` or a list of IDs with `ids = [1,2, ...]" - ) - - if response.status_code in [200, 204]: - return { - "status": "success", - "message": "RankResponse(s) deleted successfully.", - } - - # Raise an error if the response indicates failure - response.raise_for_status() diff --git a/tfbpapi/RegulatorAPI.py b/tfbpapi/RegulatorAPI.py deleted file mode 100644 index 675c002..0000000 --- a/tfbpapi/RegulatorAPI.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class RegulatorAPI(AbstractRecordsOnlyAPI): - """A class to interact with the RegulatorAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the RegulatorAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator_locus_tag", - "regulator_symbol", - "under_development", - ], - ) - - url = kwargs.pop("url", os.getenv("REGULATOR_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`REGULATOR_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The RegulatorAPI does not support retrieve.") diff --git a/tfbpapi/__init__.py b/tfbpapi/__init__.py index 5f54700..f9db664 100644 --- a/tfbpapi/__init__.py +++ b/tfbpapi/__init__.py @@ -1,37 +1,33 @@ -from .BindingAPI import BindingAPI -from .BindingConcatenatedAPI import BindingConcatenatedAPI -from .BindingManualQCAPI import BindingManualQCAPI -from .CallingCardsBackgroundAPI import CallingCardsBackgroundAPI -from .DataSourceAPI import DataSourceAPI -from .DtoAPI import DtoAPI -from .ExpressionAPI import ExpressionAPI -from .ExpressionManualQCAPI import ExpressionManualQCAPI -from .FileFormatAPI import FileFormatAPI -from .GenomicFeatureAPI import GenomicFeatureAPI -from .metric_arrays import metric_arrays -from .PromoterSetAPI import PromoterSetAPI -from .PromoterSetSigAPI import PromoterSetSigAPI -from .rank_transforms import shifted_negative_log_ranks, stable_rank, transform -from .RankResponseAPI import 
RankResponseAPI -from .RegulatorAPI import RegulatorAPI +from .datacard import DataCard +from .fetchers import HfDataCardFetcher, HfRepoStructureFetcher, HfSizeInfoFetcher +from .hf_cache_manager import HfCacheManager +from .models import ( + DatasetCard, + DatasetConfig, + DatasetType, + ExtractedMetadata, + FeatureInfo, + MetadataConfig, + MetadataRelationship, + PropertyMapping, + RepositoryConfig, +) +from .virtual_db import VirtualDB __all__ = [ - "BindingAPI", - "BindingConcatenatedAPI", - "BindingManualQCAPI", - "CallingCardsBackgroundAPI", - "DataSourceAPI", - "DtoAPI", - "ExpressionAPI", - "ExpressionManualQCAPI", - "FileFormatAPI", - "GenomicFeatureAPI", - "metric_arrays", - "transform", - "PromoterSetAPI", - "PromoterSetSigAPI", - "RankResponseAPI", - "RegulatorAPI", - "stable_rank", - "shifted_negative_log_ranks", + "DataCard", + "HfCacheManager", + "HfDataCardFetcher", + "HfRepoStructureFetcher", + "HfSizeInfoFetcher", + "MetadataConfig", + "PropertyMapping", + "RepositoryConfig", + "VirtualDB", + "DatasetCard", + "DatasetConfig", + "DatasetType", + "ExtractedMetadata", + "FeatureInfo", + "MetadataRelationship", ] diff --git a/tfbpapi/constants.py b/tfbpapi/constants.py new file mode 100644 index 0000000..749678f --- /dev/null +++ b/tfbpapi/constants.py @@ -0,0 +1,11 @@ +import os +from pathlib import Path + +from huggingface_hub.constants import HF_HUB_CACHE + +CACHE_DIR = Path(os.getenv("HF_CACHE_DIR", HF_HUB_CACHE)) + + +def get_hf_token() -> str | None: + """Get HuggingFace token from environment variable.""" + return os.getenv("HF_TOKEN") diff --git a/tfbpapi/datacard.py b/tfbpapi/datacard.py new file mode 100644 index 0000000..52a573c --- /dev/null +++ b/tfbpapi/datacard.py @@ -0,0 +1,721 @@ +""" +DataCard class for parsing and exploring HuggingFace dataset metadata. + +This module provides the DataCard class for parsing HuggingFace dataset cards +into structured Python objects that can be easily explored. 
The focus is on +enabling users to drill down into the YAML structure to understand: + +- Dataset configurations and their types +- Feature definitions and roles +- Experimental conditions at all hierarchy levels (top/config/field) +- Field-level condition definitions +- Metadata relationships + +Users can then use this information to plan metadata table structures and +data loading strategies. + +""" + +import logging +from dataclasses import dataclass +from typing import Any + +from pydantic import ValidationError + +from tfbpapi.errors import DataCardError, DataCardValidationError, HfDataFetchError +from tfbpapi.fetchers import ( + HfDataCardFetcher, + HfRepoStructureFetcher, + HfSizeInfoFetcher, +) +from tfbpapi.models import ( + DatasetCard, + DatasetConfig, + ExtractedMetadata, + FeatureInfo, + MetadataRelationship, +) + + +@dataclass +class DatasetSchema: + """ + Complete schema summary for a data configuration. + + Derived entirely from the DataCard YAML -- no DuckDB introspection needed. Used by + VirtualDB to determine column partitioning between data and metadata parquets. + + :ivar data_columns: Column names present in the data parquet. + :ivar metadata_columns: Column names that are metadata. + :ivar join_columns: Columns common to both data and metadata parquets (used as JOIN + keys for external metadata). Empty for embedded metadata (same parquet, no JOIN + needed). + :ivar metadata_source: One of ``"embedded"``, ``"external"``, or ``"none"``. + :ivar external_metadata_config: Config name of the external metadata config, or + ``None`` if metadata is embedded or absent. + :ivar is_partitioned: Whether the data parquet is partitioned. + + """ + + data_columns: set[str] + metadata_columns: set[str] + join_columns: set[str] + metadata_source: str + external_metadata_config: str | None + is_partitioned: bool + + +class DataCard: + """ + Parser and explorer for HuggingFace dataset metadata. 
+ + The parsed structure uses Pydantic models with `extra="allow"` to accept + arbitrary fields (like experimental_conditions) without requiring code + changes. + + Key capabilities: + - Parse dataset card YAML into structured objects + - Navigate experimental conditions at 3 levels (top/config/field) + - Explore field definitions and roles + - Extract metadata schema for table design + - Discover metadata relationships + + Example: + >>> card = DataCard("BrentLab/harbison_2004") + >>> # Use context manager for config exploration + >>> with card.config("harbison_2004") as cfg: + ... # Get all experimental conditions + ... conds = cfg.experimental_conditions() + ... # Get condition fields with definitions + ... fields = cfg.condition_fields() + ... # Drill down into specific field + ... for name, info in fields.items(): + ... for value, definition in info['definitions'].items(): + ... print(f"{name}={value}: {definition}") + + Example (legacy API still supported): + >>> card = DataCard("BrentLab/harbison_2004") + >>> conditions = card.get_experimental_conditions("harbison_2004") + >>> defs = card.get_field_definitions("harbison_2004", "condition") + + """ + + def __init__(self, repo_id: str, token: str | None = None): + """ + Initialize DataCard for a repository. 
+ + :param repo_id: HuggingFace repository identifier (e.g., "user/dataset") + :param token: Optional HuggingFace token for authentication + + """ + self.repo_id = repo_id + self.token = token + self.logger = logging.getLogger(self.__class__.__name__) + + # Initialize fetchers + self._card_fetcher = HfDataCardFetcher(token=token) + self._structure_fetcher = HfRepoStructureFetcher(token=token) + self._size_fetcher = HfSizeInfoFetcher(token=token) + + # Cache for parsed card + self._dataset_card: DatasetCard | None = None + self._metadata_cache: dict[str, list[ExtractedMetadata]] = {} + self._metadata_fields_map: dict[str, list[str]] = {} + + @property + def dataset_card(self) -> DatasetCard: + """Get the validated dataset card.""" + if self._dataset_card is None: + self._load_and_validate_card() + # this is here for type checking purposes. _load_and_validate_card() + # will either set the _dataset_card or raise an error + assert self._dataset_card is not None + return self._dataset_card + + def _load_and_validate_card(self) -> None: + """Load and validate the dataset card from HuggingFace.""" + try: + self.logger.debug(f"Loading dataset card for {self.repo_id}") + card_data = self._card_fetcher.fetch(self.repo_id) + + if not card_data: + raise DataCardValidationError( + f"No dataset card found for {self.repo_id}" + ) + + # Validate using Pydantic model + self._dataset_card = DatasetCard(**card_data) + self._build_metadata_fields_map() + self.logger.debug(f"Successfully validated dataset card for {self.repo_id}") + + except ValidationError as e: + # Create a more user-friendly error message + error_details = [] + for error in e.errors(): + field_path = " -> ".join(str(x) for x in error["loc"]) + error_type = error["type"] + error_msg = error["msg"] + input_value = error.get("input", "N/A") + + if "dtype" in field_path and error_type == "string_type": + error_details.append( + f"Field '{field_path}': Expected a simple data type " + "string (like 'string', 'int64', 
'float64') " + "but got a complex structure. This might be a categorical " + "field with class labels. " + f"Actual value: {input_value}" + ) + else: + error_details.append( + f"Field '{field_path}': {error_msg} (got: {input_value})" + ) + + detailed_msg = ( + f"Dataset card validation failed for {self.repo_id}:\n" + + "\n".join(f" - {detail}" for detail in error_details) + ) + self.logger.error(detailed_msg) + raise DataCardValidationError(detailed_msg) from e + except HfDataFetchError as e: + raise DataCardError(f"Failed to fetch dataset card: {e}") from e + + @property + def configs(self) -> list[DatasetConfig]: + """Get all dataset configurations.""" + return self.dataset_card.configs + + def get_config(self, config_name: str) -> DatasetConfig | None: + """Get a specific configuration by name.""" + return self.dataset_card.get_config_by_name(config_name) + + def get_features(self, config_name: str) -> list[FeatureInfo]: + """ + Get all feature definitions for a configuration. + + :param config_name: Configuration name + :return: List of FeatureInfo objects + :raises DataCardError: If config not found + + """ + config = self.get_config(config_name) + if not config: + raise DataCardError(f"Configuration '{config_name}' not found") + + return config.dataset_info.features + + def _extract_partition_values( + self, config: DatasetConfig, field_name: str + ) -> set[str]: + """Extract values from partition structure.""" + if ( + not config.dataset_info.partitioning + or not config.dataset_info.partitioning.enabled + ): + return set() + + partition_columns = config.dataset_info.partitioning.partition_by or [] + if field_name not in partition_columns: + return set() + + try: + # Get partition values from repository structure + partition_values = self._structure_fetcher.get_partition_values( + self.repo_id, field_name + ) + return set(partition_values) + except HfDataFetchError: + self.logger.warning(f"Failed to extract partition values for {field_name}") + return set() 
+ + def get_metadata_relationships( + self, refresh_cache: bool = False + ) -> list[MetadataRelationship]: + """ + Get relationships between data configs and their metadata. + + :param refresh_cache: If True, force refresh dataset card from remote + + """ + # Clear cached dataset card if refresh requested + if refresh_cache: + self._dataset_card = None + + relationships = [] + data_configs = self.dataset_card.get_data_configs() + metadata_configs = self.dataset_card.get_metadata_configs() + + for data_config in data_configs: + # Check for explicit applies_to relationships + for meta_config in metadata_configs: + if ( + meta_config.applies_to + and data_config.config_name in meta_config.applies_to + ): + relationships.append( + MetadataRelationship( + data_config=data_config.config_name, + metadata_config=meta_config.config_name, + relationship_type="explicit", + ) + ) + + # Check for embedded metadata (always runs regardless of + # explicit relationships) + if data_config.metadata_fields: + relationships.append( + MetadataRelationship( + data_config=data_config.config_name, + metadata_config=f"{data_config.config_name}_embedded", + relationship_type="embedded", + ) + ) + + return relationships + + def _build_metadata_fields_map(self) -> None: + """ + Build a mapping from data config names to their metadata fields. + + Called during card loading. For each data config, resolves metadata + fields from two sources: + + 1. Embedded: the data config has ``metadata_fields`` listing which + of its own columns are metadata. + 2. External: a separate metadata-type config has ``applies_to`` + including this config name. The metadata fields are the feature + names from that metadata config. + + Embedded takes priority. For external, the first matching metadata + config wins. 
+ + """ + assert self._dataset_card is not None + self._metadata_fields_map = {} + meta_configs = self._dataset_card.get_metadata_configs() + + for data_cfg in self._dataset_card.get_data_configs(): + name = data_cfg.config_name + # Embedded case + if data_cfg.metadata_fields: + self._metadata_fields_map[name] = list(data_cfg.metadata_fields) + continue + # External case: find metadata config with applies_to + for meta_cfg in meta_configs: + if meta_cfg.applies_to and name in meta_cfg.applies_to: + self._metadata_fields_map[name] = [ + f.name for f in meta_cfg.dataset_info.features + ] + break + else: + self.logger.info( + "No metadata fields found for data config '%s' " + "in repo '%s' -- no embedded metadata_fields and " + "no metadata config with applies_to", + name, + self.repo_id, + ) + + def get_metadata_fields(self, config_name: str) -> list[str] | None: + """ + Get metadata field names for a data configuration. + + Returns pre-computed metadata fields resolved during card loading. + Handles both embedded metadata (``metadata_fields`` on the data + config) and external metadata (separate metadata config with + ``applies_to``). + + :param config_name: Name of the data configuration + :return: List of metadata field names, or None if no metadata + + """ + # Ensure card is loaded (triggers _build_metadata_fields_map) + _ = self.dataset_card + return self._metadata_fields_map.get(config_name) + + def get_data_col_names(self, config_name: str) -> set[str]: + """ + Return the column names from the data config's feature list. + + These are the columns present in the data parquet file, derived directly from + the DataCard feature definitions without any DuckDB introspection. 
+ + :param config_name: Name of the data configuration + :return: Set of column names, empty if config not found + + """ + _ = self.dataset_card # ensure loaded + config = self.get_config(config_name) + if not config: + return set() + return {f.name for f in config.dataset_info.features} + + def get_metadata_config_name(self, config_name: str) -> str | None: + """ + Return the config_name of the external metadata config, if any. + + If the data config has embedded ``metadata_fields``, or if no + metadata config with ``applies_to`` references this config, + returns None. + + :param config_name: Name of the data configuration + :return: The metadata config name, or None + + """ + _ = self.dataset_card # ensure loaded + data_cfg = self.get_config(config_name) + if not data_cfg: + return None + # Embedded metadata -- no external config needed + if data_cfg.metadata_fields: + return None + # Find external metadata config with applies_to + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + return meta_cfg.config_name + return None + + def get_dataset_schema(self, config_name: str) -> DatasetSchema | None: + """ + Return schema summary for a data configuration. + + Determines whether metadata is embedded or external, which + columns belong to data vs metadata parquets, and which columns + are shared between them (join keys for external metadata). + All information is derived from the DataCard YAML -- no DuckDB + introspection is needed. 
+ + :param config_name: Name of the data configuration + :return: DatasetSchema instance, or None if config not found + + Example -- embedded metadata:: + + schema = card.get_dataset_schema("harbison_2004") + # schema.metadata_source == "embedded" + # schema.join_columns == set() (same parquet, no JOIN) + + Example -- external metadata:: + + schema = card.get_dataset_schema("annotated_features") + # schema.metadata_source == "external" + # schema.external_metadata_config == "annotated_features_meta" + # schema.join_columns == {"id"} (common to both parquets) + + """ + _ = self.dataset_card # ensure loaded + config = self.get_config(config_name) + if not config: + return None + + is_partitioned = bool( + config.dataset_info.partitioning + and config.dataset_info.partitioning.enabled + ) + + # Embedded: metadata_fields lists which of the config's own + # columns are metadata; all live in the same parquet + if config.metadata_fields: + all_cols = {f.name for f in config.dataset_info.features} + meta_cols = set(config.metadata_fields) + data_cols = all_cols - meta_cols + return DatasetSchema( + data_columns=data_cols, + metadata_columns=meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=is_partitioned, + ) + + # External: find metadata config with applies_to + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + data_cols = {f.name for f in config.dataset_info.features} + meta_cols = {f.name for f in meta_cfg.dataset_info.features} + join_cols = data_cols & meta_cols + return DatasetSchema( + data_columns=data_cols, + metadata_columns=meta_cols, + join_columns=join_cols, + metadata_source="external", + external_metadata_config=meta_cfg.config_name, + is_partitioned=is_partitioned, + ) + + # No metadata relationship -- treat all columns as data + all_cols = {f.name for f in config.dataset_info.features} + return DatasetSchema( + 
data_columns=all_cols, + metadata_columns=set(), + join_columns=set(), + metadata_source="none", + external_metadata_config=None, + is_partitioned=is_partitioned, + ) + + def get_repository_info(self) -> dict[str, Any]: + """Get general repository information.""" + card = self.dataset_card + + try: + structure = self._structure_fetcher.fetch(self.repo_id) + total_files = structure.get("total_files", 0) + last_modified = structure.get("last_modified") + except HfDataFetchError: + total_files = None + last_modified = None + + return { + "repo_id": self.repo_id, + "pretty_name": card.pretty_name, + "license": card.license, + "tags": card.tags, + "language": card.language, + "size_categories": card.size_categories, + "num_configs": len(card.configs), + "dataset_types": [config.dataset_type.value for config in card.configs], + "total_files": total_files, + "last_modified": last_modified, + "has_default_config": self.dataset_card.default_config is not None, + } + + def extract_metadata_schema(self, config_name: str) -> dict[str, Any]: + """ + Extract complete metadata schema for planning metadata table structure. + + This is the primary method for understanding what metadata is available and + how to structure it into a metadata table. It consolidates information from + all sources: + + - **Field roles**: Which fields are regulators, targets, conditions, etc. + - **Top-level conditions**: Repo-wide conditions (constant for all samples) + - **Config-level conditions**: Config-specific conditions + (constant for this config) + - **Field-level definitions**: Per-sample condition definitions + + The returned schema provides all the information needed to: + 1. Identify sample identifier fields (regulator_identifier, etc.) + 2. Determine which conditions are constant vs. variable + 3. Access condition definitions for creating flattened columns + 4. 
Plan metadata table structure + + :param config_name: Configuration name to extract schema for + :return: Dict with comprehensive schema including: + - regulator_fields: List of regulator identifier field names + - target_fields: List of target identifier field names + - condition_fields: List of experimental_condition field names + - condition_definitions: Dict mapping field -> value -> definition + - top_level_conditions: Dict of repo-wide conditions + - config_level_conditions: Dict of config-specific conditions + :raises DataCardError: If configuration not found + + Example: + >>> schema = card.extract_metadata_schema('harbison_2004') + >>> # Identify identifier fields + >>> print(f"Regulator fields: {schema['regulator_fields']}") + >>> # Check for constant conditions + >>> if schema['top_level_conditions']: + ... print("Has repo-wide constant conditions") + >>> # Get field-level definitions for metadata table + >>> for field in schema['condition_fields']: + ... defs = schema['condition_definitions'][field] + ... 
print(f"{field} has {len(defs)} levels") + + """ + config = self.get_config(config_name) + if not config: + raise DataCardError(f"Configuration '{config_name}' not found") + + schema: dict[str, Any] = { + "regulator_fields": [], + "target_fields": [], + "condition_fields": [], + "condition_definitions": {}, + "metadata_fields": None, + "top_level_conditions": None, + "config_level_conditions": None, + } + + for feature in config.dataset_info.features: + if feature.role == "regulator_identifier": + schema["regulator_fields"].append(feature.name) + elif feature.role == "target_identifier": + schema["target_fields"].append(feature.name) + elif feature.role == "experimental_condition": + schema["condition_fields"].append(feature.name) + if feature.definitions: + schema["condition_definitions"][feature.name] = feature.definitions + + # Include features from external metadata config + meta_fields = self.get_metadata_fields(config_name) + schema["metadata_fields"] = meta_fields + if meta_fields is not None and not config.metadata_fields: + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + for feature in meta_cfg.dataset_info.features: + if feature.role == "regulator_identifier": + schema["regulator_fields"].append(feature.name) + elif feature.role == "target_identifier": + schema["target_fields"].append(feature.name) + elif feature.role == "experimental_condition": + schema["condition_fields"].append(feature.name) + if feature.definitions: + schema["condition_definitions"][ + feature.name + ] = feature.definitions + break + + # Add top-level conditions (applies to all configs/samples) + if self.dataset_card.model_extra: + top_level = self.dataset_card.model_extra.get("experimental_conditions") + if top_level: + schema["top_level_conditions"] = top_level + + # Add config-level conditions (applies to this config's samples) + if config.model_extra: + config_level = 
config.model_extra.get("experimental_conditions") + if config_level: + schema["config_level_conditions"] = config_level + + return schema + + def get_experimental_conditions( + self, config_name: str | None = None + ) -> dict[str, Any]: + """ + Get experimental conditions with proper hierarchy handling. + + This method enables drilling down into the experimental conditions hierarchy: + - Top-level (repo-wide): Common to all configs/samples + - Config-level: Specific to a config, common to its samples + - Field-level: Per-sample variation (use get_field_definitions instead) + + Returns experimental conditions at the appropriate level: + - If config_name is None: returns top-level (repo-wide) conditions only + - If config_name is provided: returns merged (top + config) conditions + + All conditions are returned as flexible dicts that preserve the original + YAML structure. Navigate nested dicts to access specific values. + + :param config_name: Optional config name. If provided, merges top + and config levels + :return: Dict of experimental conditions (empty dict if none defined) + + Example: + >>> # Get top-level conditions + >>> top = card.get_experimental_conditions() + >>> temp = top.get('temperature_celsius', 30) + >>> + >>> # Get merged conditions for a config + >>> merged = card.get_experimental_conditions('config_name') + >>> media = merged.get('media', {}) + >>> media_name = media.get('name', 'unspecified') + + """ + # Get top-level conditions (stored in model_extra) + top_level = ( + self.dataset_card.model_extra.get("experimental_conditions", {}) + if self.dataset_card.model_extra + else {} + ) + + # If no config specified, return top-level only + if config_name is None: + return top_level.copy() if isinstance(top_level, dict) else {} + + # Get config-level conditions + config = self.get_config(config_name) + if not config: + raise DataCardError(f"Configuration '{config_name}' not found") + + config_level = ( + 
config.model_extra.get("experimental_conditions", {}) + if config.model_extra + else {} + ) + + # Merge: config-level overrides top-level + merged = {} + if isinstance(top_level, dict): + merged.update(top_level) + if isinstance(config_level, dict): + merged.update(config_level) + + return merged + + def get_field_definitions( + self, config_name: str, field_name: str + ) -> dict[str, Any]: + """ + Get definitions for a specific field (field-level conditions). + + This is the third level of the experimental conditions hierarchy - conditions + that vary per sample. Returns a dict mapping each possible field value to its + detailed specification. + + For fields with role=experimental_condition, the definitions typically include + nested structures like media composition, temperature, treatments, etc. that + define what each categorical value means experimentally. + + :param config_name: Configuration name + :param field_name: Field name (typically has role=experimental_condition) + :return: Dict mapping field values to their definition dicts + (empty if no definitions) + :raises DataCardError: If config or field not found + + Example: + >>> # Get condition definitions + >>> defs = card.get_field_definitions('harbison_2004', 'condition') + >>> # defs = {'YPD': {...}, 'HEAT': {...}, ...} + >>> + >>> # Drill down into a specific condition + >>> ypd = defs['YPD'] + >>> env_conds = ypd.get('environmental_conditions', {}) + >>> media = env_conds.get('media', {}) + >>> media_name = media.get('name') + + """ + config = self.get_config(config_name) + if not config: + raise DataCardError(f"Configuration '{config_name}' not found") + + # Find the feature + feature = None + for f in config.dataset_info.features: + if f.name == field_name: + feature = f + break + + if not feature: + raise DataCardError( + f"Field '{field_name}' not found in config '{config_name}'" + ) + + # Return definitions if present, otherwise empty dict + return feature.definitions if feature.definitions 
else {} + + def summary(self) -> str: + """Get a human-readable summary of the dataset.""" + card = self.dataset_card + info = self.get_repository_info() + + lines = [ + f"Dataset: {card.pretty_name or self.repo_id}", + f"Repository: {self.repo_id}", + f"License: {card.license or 'Not specified'}", + f"Configurations: {len(card.configs)}", + f"Dataset Types: {', '.join(info['dataset_types'])}", + ] + + if card.tags: + lines.append(f"Tags: {', '.join(card.tags)}") + + # Add config summaries + lines.append("\nConfigurations:") + for config in card.configs: + default_mark = " (default)" if config.default else "" + lines.append( + f" - {config.config_name}: {config.dataset_type.value}{default_mark}" + ) + lines.append(f" {config.description}") + + return "\n".join(lines) diff --git a/tfbpapi/errors.py b/tfbpapi/errors.py new file mode 100644 index 0000000..2735964 --- /dev/null +++ b/tfbpapi/errors.py @@ -0,0 +1,37 @@ +"""Custom exception classes for dataset management.""" + + +class HfDataFetchError(Exception): + """Raised when HuggingFace API requests fail.""" + + def __init__( + self, + message: str, + repo_id: str | None = None, + status_code: int | None = None, + endpoint: str | None = None, + ): + super().__init__(message) + self.repo_id = repo_id + self.status_code = status_code + self.endpoint = endpoint + + +class DataCardError(Exception): + """Base exception for DataCard operations.""" + + pass + + +class DataCardValidationError(DataCardError): + """Exception raised when dataset card validation fails.""" + + def __init__( + self, + message: str, + repo_id: str | None = None, + validation_errors: list | None = None, + ): + super().__init__(message) + self.repo_id = repo_id + self.validation_errors = validation_errors or [] diff --git a/tfbpapi/fetchers.py b/tfbpapi/fetchers.py new file mode 100644 index 0000000..c8d978f --- /dev/null +++ b/tfbpapi/fetchers.py @@ -0,0 +1,244 @@ +"""Data fetchers for HuggingFace Hub integration.""" + +import logging +import re 
+from typing import Any + +import requests +from huggingface_hub import DatasetCard, repo_info +from requests import HTTPError + +from tfbpapi.constants import get_hf_token +from tfbpapi.errors import HfDataFetchError + + +class HfDataCardFetcher: + """Handles fetching dataset cards from HuggingFace Hub.""" + + def __init__(self, token: str | None = None): + """ + Initialize the fetcher. + + :param token: HuggingFace token for authentication + + """ + self.logger = logging.getLogger(self.__class__.__name__) + self.token = token or get_hf_token() + + def fetch(self, repo_id: str, repo_type: str = "dataset") -> dict[str, Any]: + """ + Fetch and return dataset card data. + + :param repo_id: Repository identifier (e.g., "user/dataset") + :param repo_type: Type of repository ("dataset", "model", "space") + :return: Dataset card data as dictionary + :raises HfDataFetchError: If fetching fails + + """ + try: + self.logger.debug(f"Fetching dataset card for {repo_id}") + card = DatasetCard.load(repo_id, repo_type=repo_type, token=self.token) + + if not card.data: + self.logger.warning(f"Dataset card for {repo_id} has no data section") + return {} + + return card.data.to_dict() + + except Exception as e: + error_msg = f"Failed to fetch dataset card for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + +class HfSizeInfoFetcher: + """Handles fetching size information from HuggingFace Dataset Server API.""" + + def __init__(self, token: str | None = None): + """ + Initialize the fetcher. 
+ + :param token: HuggingFace token for authentication + + """ + self.logger = logging.getLogger(self.__class__.__name__) + self.token = token or get_hf_token() + self.base_url = "https://datasets-server.huggingface.co" + + def _build_headers(self) -> dict[str, str]: + """Build request headers with authentication if available.""" + headers = {"User-Agent": "TFBP-API/1.0"} + if self.token: + headers["Authorization"] = f"Bearer {self.token}" + return headers + + def fetch(self, repo_id: str) -> dict[str, Any]: + """ + Fetch dataset size information. + + :param repo_id: Repository identifier (e.g., "user/dataset") + :return: Size information as dictionary + :raises HfDataFetchError: If fetching fails + + """ + url = f"{self.base_url}/size" + params = {"dataset": repo_id} + headers = self._build_headers() + + try: + self.logger.debug(f"Fetching size info for {repo_id}") + response = requests.get(url, params=params, headers=headers, timeout=30) + response.raise_for_status() + + data = response.json() + self.logger.debug(f"Size info fetched successfully for {repo_id}") + return data + + except HTTPError as e: + if e.response.status_code == 404: + error_msg = f"Dataset {repo_id} not found" + elif e.response.status_code == 403: + error_msg = ( + f"Access denied to dataset {repo_id} (check token permissions)" + ) + else: + error_msg = f"HTTP error fetching size for {repo_id}: {e}" + + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + except requests.RequestException as e: + error_msg = f"Request failed fetching size for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + except ValueError as e: + error_msg = f"Invalid JSON response fetching size for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + +class HfRepoStructureFetcher: + """Handles fetching repository structure from HuggingFace Hub.""" + + def __init__(self, token: str | None = None): + """ + Initialize 
the fetcher. + + :param token: HuggingFace token for authentication + + """ + self.logger = logging.getLogger(self.__class__.__name__) + self.token = token or get_hf_token() + self._cached_structure: dict[str, dict[str, Any]] = {} + + def fetch(self, repo_id: str, force_refresh: bool = False) -> dict[str, Any]: + """ + Fetch repository structure information. + + :param repo_id: Repository identifier (e.g., "user/dataset") + :param force_refresh: If True, bypass cache and fetch fresh data + :return: Repository structure information + :raises HfDataFetchError: If fetching fails + + """ + # Check cache first unless force refresh is requested + if not force_refresh and repo_id in self._cached_structure: + self.logger.debug(f"Using cached repo structure for {repo_id}") + return self._cached_structure[repo_id] + + try: + self.logger.debug(f"Fetching repo structure for {repo_id}") + info = repo_info(repo_id=repo_id, repo_type="dataset", token=self.token) + + # Extract file structure + files = [] + partitions: dict[str, set] = {} + + for sibling in info.siblings or []: + file_info = { + "path": sibling.rfilename, + "size": sibling.size, + "is_lfs": sibling.lfs is not None, + } + files.append(file_info) + + # Extract partition information from file paths + self._extract_partition_info(sibling.rfilename, partitions) + + result = { + "repo_id": repo_id, + "files": files, + "partitions": partitions, + "total_files": len(files), + "last_modified": ( + info.last_modified.isoformat() if info.last_modified else None + ), + } + + # Cache the result + self._cached_structure[repo_id] = result + return result + + except Exception as e: + error_msg = f"Failed to fetch repo structure for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + def _extract_partition_info( + self, file_path: str, partitions: dict[str, set[str]] + ) -> None: + """ + Extract partition information from file paths. 
+ + :param file_path: Path to analyze for partitions + :param partitions: Dictionary to update with partition info + + """ + # Look for partition patterns like "column=value" in path + partition_pattern = r"([^/=]+)=([^/]+)" + matches = re.findall(partition_pattern, file_path) + + for column, value in matches: + if column not in partitions: + partitions[column] = set() + partitions[column].add(value) + + def get_partition_values( + self, repo_id: str, partition_column: str, force_refresh: bool = False + ) -> list[str]: + """ + Get all values for a specific partition column. + + :param repo_id: Repository identifier + :param partition_column: Name of the partition column + :param force_refresh: If True, bypass cache and fetch fresh data + :return: List of unique partition values + :raises HfDataFetchError: If fetching fails + + """ + structure = self.fetch(repo_id, force_refresh=force_refresh) + partition_values = structure.get("partitions", {}).get(partition_column, set()) + return sorted(list(partition_values)) + + def get_dataset_files( + self, repo_id: str, path_pattern: str | None = None, force_refresh: bool = False + ) -> list[dict[str, Any]]: + """ + Get dataset files, optionally filtered by path pattern. 
+ + :param repo_id: Repository identifier + :param path_pattern: Optional regex pattern to filter files + :param force_refresh: If True, bypass cache and fetch fresh data + :return: List of matching files + :raises HfDataFetchError: If fetching fails + + """ + structure = self.fetch(repo_id, force_refresh=force_refresh) + files = structure["files"] + + if path_pattern: + pattern = re.compile(path_pattern) + files = [f for f in files if pattern.search(f["path"])] + + return files diff --git a/tfbpapi/hf_cache_manager.py b/tfbpapi/hf_cache_manager.py new file mode 100644 index 0000000..26ca708 --- /dev/null +++ b/tfbpapi/hf_cache_manager.py @@ -0,0 +1,631 @@ +import logging +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any, Literal + +import duckdb +from huggingface_hub import scan_cache_dir, try_to_load_from_cache +from huggingface_hub.utils import DeleteCacheStrategy + +from tfbpapi.datacard import DataCard + + +class HfCacheManager(DataCard): + """Enhanced cache management for Hugging Face Hub with metadata-focused + retrieval.""" + + def __init__( + self, + repo_id: str, + duckdb_conn: duckdb.DuckDBPyConnection, + token: str | None = None, + logger: logging.Logger | None = None, + ): + super().__init__(repo_id, token) + self.duckdb_conn = duckdb_conn + self.logger = logger or logging.getLogger(__name__) + + def _get_metadata_for_config( + self, config, force_refresh: bool = False + ) -> dict[str, Any]: + """ + Get metadata for a specific configuration using 3-case strategy. 
+ + :param config: Configuration object to process + :param force_refresh: If True, skip cache checks and download fresh from remote + + """ + config_result = { + "config_name": config.config_name, + "strategy": None, + "table_name": None, + "success": False, + "message": "", + } + + table_name = f"metadata_{config.config_name}" + + try: + # Skip cache checks if force_refresh is True + if not force_refresh: + # Case 1: Check if metadata already exists in DuckDB + if self._check_metadata_exists_in_duckdb(table_name): + config_result.update( + { + "strategy": "duckdb_exists", + "table_name": table_name, + "success": True, + "message": f"Metadata table {table_name} " + "already exists in DuckDB", + } + ) + return config_result + + # Case 2: Check if HF data is in cache, create DuckDB representation + if self._load_metadata_from_cache(config, table_name): + config_result.update( + { + "strategy": "cache_loaded", + "table_name": table_name, + "success": True, + "message": "Loaded metadata from cache " + f"into table {table_name}", + } + ) + return config_result + + # Case 3: Download from HF (explicit vs embedded) + if self._download_and_load_metadata(config, table_name): + config_result.update( + { + "strategy": "downloaded", + "table_name": table_name, + "success": True, + "message": "Downloaded and loaded metadata " + f"into table {table_name}", + } + ) + return config_result + + config_result["message"] = ( + f"Failed to retrieve metadata for {config.config_name}" + ) + + except Exception as e: + config_result["message"] = f"Error processing {config.config_name}: {e}" + self.logger.error(f"Error in metadata config {config.config_name}: {e}") + + return config_result + + def _check_metadata_exists_in_duckdb(self, table_name: str) -> bool: + """Case 1: Check if metadata table already exists in DuckDB database.""" + try: + # Query information schema to check if table exists + result = self.duckdb_conn.execute( + "SELECT table_name FROM information_schema.tables WHERE 
table_name = ?", + [table_name], + ).fetchone() + + exists = result is not None + if exists: + self.logger.debug(f"Table {table_name} already exists in DuckDB") + return exists + + except Exception as e: + self.logger.debug(f"Error checking DuckDB table existence: {e}") + return False + + def _load_metadata_from_cache(self, config, table_name: str) -> bool: + """Case 2: HF data in cache, create DuckDB representation.""" + try: + # Check if metadata files are cached locally + cached_files = [] + for data_file in config.data_files: + cached_path = try_to_load_from_cache( + repo_id=self.repo_id, + filename=data_file.path, + repo_type="dataset", + ) + + if isinstance(cached_path, str) and Path(cached_path).exists(): + cached_files.append(cached_path) + + if not cached_files: + self.logger.debug(f"No cached files found for {config.config_name}") + return False + + # Load cached parquet files into DuckDB + self._create_duckdb_table_from_files( + cached_files, table_name, config.config_name + ) + self.logger.info( + f"Loaded {len(cached_files)} cached files into {table_name}" + ) + return True + + except Exception as e: + self.logger.debug(f"Error loading from cache for {config.config_name}: {e}") + return False + + def _download_and_load_metadata(self, config, table_name: str) -> bool: + """Case 3: Download from HF (explicit vs embedded).""" + try: + from huggingface_hub import snapshot_download + + # Download specific files for this metadata config + file_patterns = [data_file.path for data_file in config.data_files] + + downloaded_path = snapshot_download( + repo_id=self.repo_id, + repo_type="dataset", + allow_patterns=file_patterns, + token=self.token, + ) + + # Find downloaded parquet files + downloaded_files = [] + for pattern in file_patterns: + file_path = Path(downloaded_path) / pattern + if file_path.exists() and file_path.suffix == ".parquet": + downloaded_files.append(str(file_path)) + else: + # Handle wildcard patterns, including nested wildcards + if "*" in 
pattern: + # Use glob on the full pattern relative to downloaded_path + base_path = Path(downloaded_path) + matching_files = list(base_path.glob(pattern)) + downloaded_files.extend( + [str(f) for f in matching_files if f.suffix == ".parquet"] + ) + else: + # Handle non-wildcard patterns that might be directories + parent_dir = Path(downloaded_path) / Path(pattern).parent + if parent_dir.exists(): + downloaded_files.extend( + [str(f) for f in parent_dir.glob("*.parquet")] + ) + + if not downloaded_files: + self.logger.warning( + f"No parquet files found after download for {config.config_name}" + ) + return False + + # Load downloaded files into DuckDB + self._create_duckdb_table_from_files( + downloaded_files, table_name, config.config_name + ) + self.logger.info( + f"Downloaded and loaded {len(downloaded_files)} files into {table_name}" + ) + return True + + except Exception as e: + self.logger.error( + f"Error downloading metadata for {config.config_name}: {e}" + ) + return False + + def _create_duckdb_table_from_files( + self, file_paths: list[str], table_name: str, config_name: str + ) -> None: + """Create DuckDB table/view from parquet files.""" + if len(file_paths) == 1: + # Single file + create_sql = f""" + CREATE OR REPLACE VIEW {table_name} AS + SELECT * FROM read_parquet('{file_paths[0]}') + """ + else: + # Multiple files + files_str = "', '".join(file_paths) + create_sql = f""" + CREATE OR REPLACE VIEW {table_name} AS + SELECT * FROM read_parquet(['{files_str}']) + """ + + self.duckdb_conn.execute(create_sql) + self.logger.debug( + f"Created DuckDB view {table_name} from {len(file_paths)} files" + ) + + # Validate source_sample fields if they exist + self._validate_source_sample_fields(table_name, config_name) + + def _validate_source_sample_fields(self, table_name: str, config_name: str) -> None: + """ + Validate source_sample fields have correct format. 
+ + Composite sample identifiers must be in the format: + "repo_id;config_name;sample_id" (exactly 3 semicolon-separated parts) + + """ + config = self.get_config(config_name) + + # Find all source_sample fields + source_sample_fields = [ + f.name + for f in config.dataset_info.features # type: ignore + if f.role == "source_sample" + ] + + if not source_sample_fields: + return # No validation needed + + # For each field, validate format + for field_name in source_sample_fields: + query = f""" + SELECT {field_name}, + LENGTH({field_name}) - LENGTH(REPLACE({field_name}, ';', '')) + AS semicolon_count + FROM {table_name} + WHERE semicolon_count != 2 + LIMIT 1 + """ + result = self.duckdb_conn.execute(query).fetchone() + + if result: + raise ValueError( + f"Invalid format in field '{field_name}' " + f"with role='source_sample'. " + f"Expected 'repo_id;config_name;sample_id' " + f"(3 semicolon-separated parts), " + f"but found: '{result[0]}'" + ) + + def _extract_embedded_metadata_field( + self, data_table_name: str, field_name: str, metadata_table_name: str + ) -> bool: + """Extract a specific metadata field from a data table.""" + try: + # Create a metadata view with unique values from the specified field + extract_sql = f""" + CREATE OR REPLACE VIEW {metadata_table_name} AS + SELECT DISTINCT {field_name} as value, COUNT(*) as count + FROM {data_table_name} + WHERE {field_name} IS NOT NULL + GROUP BY {field_name} + ORDER BY count DESC + """ + + self.duckdb_conn.execute(extract_sql) + + # Verify the table was created and has data + count_result = self.duckdb_conn.execute( + f"SELECT COUNT(*) FROM {metadata_table_name}" + ).fetchone() + + if count_result and count_result[0] > 0: + self.logger.info( + f"Extracted {count_result[0]} unique values for {field_name} " + f"into {metadata_table_name}" + ) + return True + else: + self.logger.warning(f"No data found for field {field_name}") + return False + + except Exception as e: + self.logger.error(f"Error extracting field 
{field_name}: {e}") + return False + + def clean_cache_by_age( + self, + max_age_days: int = 30, + dry_run: bool = True, + ) -> DeleteCacheStrategy: + """ + Clean cache entries older than specified age. + + :param max_age_days: Remove revisions older than this many days + :param dry_run: If True, show what would be deleted without executing + size_threshold: Only delete if total cache size exceeds this (e.g., "10GB") + + :return: DeleteCacheStrategy object that can be executed + + """ + cache_info = scan_cache_dir() + cutoff_date = datetime.now() - timedelta(days=max_age_days) + + old_revisions = [] + for repo in cache_info.repos: + for revision in repo.revisions: + # Check if revision is older than cutoff + revision_date = datetime.fromtimestamp(revision.last_modified) + if revision_date < cutoff_date: + old_revisions.append(revision.commit_hash) + self.logger.debug( + f"Marking for deletion: {revision.commit_hash} " + f"(last modified: {revision.last_modified})" + ) + + if not old_revisions: + self.logger.info("No old revisions found to delete") + # return None + + delete_strategy = cache_info.delete_revisions(*old_revisions) + + self.logger.info( + f"Found {len(old_revisions)} old revisions. " + f"Will free {delete_strategy.expected_freed_size_str}" + ) + + if not dry_run: + delete_strategy.execute() + self.logger.info( + f"Cache cleanup completed. Freed " + f"{delete_strategy.expected_freed_size_str}" + ) + else: + self.logger.info("Dry run completed. Use dry_run=False to execute deletion") + + return delete_strategy + + def clean_cache_by_size( + self, + target_size: str, + strategy: Literal[ + "oldest_first", "largest_first", "least_used" + ] = "oldest_first", + dry_run: bool = True, + ) -> DeleteCacheStrategy: + """ + Clean cache to reach target size by removing revisions. 
+ + :param target_size: Target cache size (e.g., "5GB", "500MB") + :param strategy: Deletion strategy - "oldest_first", "largest_first", + "least_used" + :param dry_run: If True, show what would be deleted without executing + + :return: DeleteCacheStrategy object that can be executed + + """ + cache_info = scan_cache_dir() + current_size = cache_info.size_on_disk + target_bytes = self._parse_size_string(target_size) + + if current_size <= target_bytes: + self.logger.info( + f"Cache size ({cache_info.size_on_disk_str}) already below " + f"target ({target_size})" + ) + + bytes_to_free = current_size - target_bytes + + # Get all revisions sorted by strategy + all_revisions = [] + for repo in cache_info.repos: + for revision in repo.revisions: + all_revisions.append(revision) + + # Sort revisions based on strategy + if strategy == "oldest_first": + all_revisions.sort(key=lambda r: r.last_modified) + elif strategy == "largest_first": + all_revisions.sort(key=lambda r: r.size_on_disk, reverse=True) + elif strategy == "least_used": + # Use last_modified as proxy for usage + all_revisions.sort(key=lambda r: r.last_modified) + else: + raise ValueError(f"Unknown strategy: {strategy}") + + # Select revisions to delete + revisions_to_delete = [] + freed_bytes = 0 + + for revision in all_revisions: + if freed_bytes >= bytes_to_free: + break + revisions_to_delete.append(revision.commit_hash) + freed_bytes += revision.size_on_disk + + if not revisions_to_delete: + self.logger.warning("No revisions selected for deletion") + + delete_strategy = cache_info.delete_revisions(*revisions_to_delete) + + self.logger.info( + f"Selected {len(revisions_to_delete)} revisions for deletion. " + f"Will free {delete_strategy.expected_freed_size_str}" + ) + + if not dry_run: + delete_strategy.execute() + self.logger.info( + f"Cache cleanup completed. Freed " + f"{delete_strategy.expected_freed_size_str}" + ) + else: + self.logger.info("Dry run completed. 
Use dry_run=False to execute deletion") + + return delete_strategy + + def clean_unused_revisions( + self, keep_latest: int = 2, dry_run: bool = True + ) -> DeleteCacheStrategy: + """ + Clean unused revisions, keeping only the latest N revisions per repo. + + :param keep_latest: Number of latest revisions to keep per repo + :param dry_run: If True, show what would be deleted without executing + :return: DeleteCacheStrategy object that can be executed + + """ + cache_info = scan_cache_dir() + revisions_to_delete = [] + + for repo in cache_info.repos: + # Sort revisions by last modified (newest first) + sorted_revisions = sorted( + repo.revisions, key=lambda r: r.last_modified, reverse=True + ) + + # Keep the latest N, mark the rest for deletion + if len(sorted_revisions) > keep_latest: + old_revisions = sorted_revisions[keep_latest:] + for revision in old_revisions: + revisions_to_delete.append(revision.commit_hash) + self.logger.debug( + f"Marking old revision for deletion: {repo.repo_id} - " + f"{revision.commit_hash}" + ) + + delete_strategy = cache_info.delete_revisions(*revisions_to_delete) + + self.logger.info( + f"Found {len(revisions_to_delete)} unused revisions. " + f"Will free {delete_strategy.expected_freed_size_str}" + ) + + if not dry_run: + delete_strategy.execute() + self.logger.info( + f"Cache cleanup completed. Freed " + f"{delete_strategy.expected_freed_size_str}" + ) + else: + self.logger.info("Dry run completed. Use dry_run=False to execute deletion") + + return delete_strategy + + def auto_clean_cache( + self, + max_age_days: int = 30, + max_total_size: str = "10GB", + keep_latest_per_repo: int = 2, + dry_run: bool = True, + ) -> list[DeleteCacheStrategy]: + """ + Automated cache cleaning with multiple strategies. 
+ + :param max_age_days: Remove revisions older than this + :param max_total_size: Target maximum cache size + :param keep_latest_per_repo: Keep this many latest revisions per repo + :param dry_run: If True, show what would be deleted without executing + :return: List of DeleteCacheStrategy objects that were executed + + """ + strategies_executed = [] + + self.logger.info("Starting automated cache cleanup...") + + # Step 1: Remove very old revisions + strategy = self.clean_cache_by_age(max_age_days=max_age_days, dry_run=dry_run) + if strategy: + strategies_executed.append(strategy) + + # Step 2: Remove unused revisions (keep only latest per repo) + strategy = self.clean_unused_revisions( + keep_latest=keep_latest_per_repo, dry_run=dry_run + ) + if strategy: + strategies_executed.append(strategy) + + # Step 3: If still over size limit, remove more aggressively + cache_info = scan_cache_dir() + if cache_info.size_on_disk > self._parse_size_string(max_total_size): + strategy = self.clean_cache_by_size( + target_size=max_total_size, strategy="oldest_first", dry_run=dry_run + ) + if strategy: + strategies_executed.append(strategy) + + total_freed = sum(s.expected_freed_size for s in strategies_executed) + self.logger.info( + f"Automated cleanup complete. 
Total freed: " + f"{self._format_bytes(total_freed)}" + ) + + return strategies_executed + + def _parse_size_string(self, size_str: str) -> int: + """Parse size string like '10GB' to bytes.""" + size_str = size_str.upper().strip() + + # Check longer units first to avoid partial matches + multipliers = {"TB": 1024**4, "GB": 1024**3, "MB": 1024**2, "KB": 1024, "B": 1} + + for unit, multiplier in multipliers.items(): + if size_str.endswith(unit): + number = float(size_str[: -len(unit)]) + return int(number * multiplier) + + # If no unit specified, assume bytes + return int(size_str) + + def _format_bytes(self, bytes_size: int) -> str: + """Format bytes into human readable string.""" + if bytes_size == 0: + return "0B" + + # iterate over common units, dividing by 1024 each time, to find an + # appropriate unit. Default to TB if the size is very large + size = float(bytes_size) + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size < 1024.0: + return f"{size:.1f}{unit}" + size /= 1024.0 + return f"{size:.1f}TB" + + def query(self, sql: str, config_name: str, refresh_cache: bool = False) -> Any: + """ + Execute SQL query against a specific dataset configuration. + + Loads the specified configuration and executes the SQL query. + Automatically replaces the config name in the SQL with the actual + table name for user convenience. 
+ + :param sql: SQL query to execute + :param config_name: Configuration name to query (table will be loaded + if needed) + :param refresh_cache: If True, force refresh from remote instead of + using cache + :return: DataFrame with query results + :raises ValueError: If config_name not found or query fails + + Example: + mgr = HfCacheManager("BrentLab/harbison_2004", duckdb.connect()) + df = mgr.query( + "SELECT DISTINCT sample_id FROM harbison_2004", + "harbison_2004" + ) + + """ + # Validate config exists + if config_name not in [c.config_name for c in self.configs]: + available_configs = [c.config_name for c in self.configs] + raise ValueError( + f"Config '{config_name}' not found. " + f"Available configs: {available_configs}" + ) + + # Load the configuration data + config = self.get_config(config_name) + if not config: + raise ValueError(f"Could not retrieve config '{config_name}'") + + config_result = self._get_metadata_for_config( + config, force_refresh=refresh_cache + ) + if not config_result.get("success", False): + raise ValueError( + f"Failed to load data for config '{config_name}': " + f"{config_result.get('message', 'Unknown error')}" + ) + + table_name = config_result.get("table_name") + if not table_name: + raise ValueError(f"No table available for config '{config_name}'") + + # Replace config name with actual table name in SQL for user convenience + modified_sql = sql.replace(config_name, table_name) + + # Execute query + try: + result = self.duckdb_conn.execute(modified_sql).fetchdf() + self.logger.debug(f"Query executed successfully on {config_name}") + return result + except Exception as e: + self.logger.error(f"Query execution failed: {e}") + self.logger.error(f"SQL: {modified_sql}") + raise ValueError(f"Query execution failed: {e}") from e diff --git a/tfbpapi/metric_arrays.py b/tfbpapi/metric_arrays.py deleted file mode 100644 index 2bfaf14..0000000 --- a/tfbpapi/metric_arrays.py +++ /dev/null @@ -1,162 +0,0 @@ -import logging -from 
collections.abc import Callable - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def metric_arrays( - res_dict: dict[str, pd.DataFrame | dict[str, pd.DataFrame]], - metrics_dict: dict[str, Callable], - rownames: str = "target_symbol", - colnames: str = "regulator_symbol", - row_dedup_func: Callable | None = None, - drop_incomplete_rows: bool = True, -) -> dict[str, pd.DataFrame]: - """ - Extract specified metrics from an AbstractRecordsAndFilesAPI instance's - read(retrieve_files=True) results object. - - :param res_dict: The output of an AbstractRecordsAndFiles instance. - :param metrics_dict: A dictionary where the keys are metrics and the values are - functions to apply to rows in the event that there are multiple rows with - the same rownames. Set to None to raise error if duplicate rownames are found. - :param rownames: Column name to use for row labels. - :param colnames: Column name to use for column labels. - :param drop_incomplete_rows: When True, drops rows and columns with all NaN values. - - :return: A dictionary where the metric is the key and the value is a DataFrame. - The column values are metric values, and the column names correspond - to `colnames` in the metadata DataFrame. 
- - :raises AttributeError: If the values in `colnames` or `rownames` are not unique - :raises KeyError: If the res_dict does not have keys 'metadata' and 'data' - :raises KeyError: If the data dictionary does not have the same keys as the 'id' - column - :raises ValueError: If the metadata does not have an 'id' column - :raises ValueError: If either the metadata or the data dictionary values are not - DataFrames - :raises ValueError: If the `colnames` is not in the res_dict metadata - :raises ValueError: If the `rownames` is not in the res_dict data - :raises ValueError: If the metrics are not in the data dictionary - - """ - - # Check required keys - if not all(k in res_dict for k in ["metadata", "data"]): - raise KeyError("res_dict must have keys 'metadata' and 'data'") - - metadata: pd.DataFrame = res_dict["metadata"] - - # Verify 'id' in metadata - if "id" not in metadata.columns: - raise ValueError("metadata must have an 'id' column") - - # Check for missing keys in 'data' - missing_keys = [k for k in metadata["id"] if str(k) not in res_dict["data"]] - if missing_keys: - raise KeyError( - f"Data dictionary must have the same keys as the 'id' " - f"column. 
Missing keys: {missing_keys}" - ) - - # Ensure all data dictionary values are DataFrames - if not all(isinstance(v, pd.DataFrame) for v in res_dict["data"].values()): - raise ValueError("All values in the data dictionary must be DataFrames") - - # Verify rownames in data and colnames in metadata - if colnames not in metadata.columns: - raise ValueError(f"colnames '{colnames}' not in metadata") - data_with_missing_rownames = [ - id for id, df in res_dict["data"].items() if rownames not in df.columns - ] - if data_with_missing_rownames: - raise ValueError( - f"rownames '{rownames}' not in data for ids: {data_with_missing_rownames}" - ) - - # Factorize unique row and column labels - row_labels = pd.Index( - {item for df in res_dict["data"].values() for item in df[rownames].unique()} - ) - - # Initialize output dictionary with NaN DataFrames for each metric - output_dict = { - m: pd.DataFrame(index=pd.Index(row_labels, name=rownames)) - for m in metrics_dict.keys() - } - - # Populate DataFrames with metric values - info_msgs = set() - for _, row in metadata.iterrows(): - try: - data = res_dict["data"][row["id"]] - except KeyError: - info_msgs.add("casting `id` to str to extract data from res_dict['data']") - data = res_dict["data"][str(row["id"])] - - for metric, row_dedup_func in metrics_dict.items(): - # Filter data to include only the rownames and metric columns - if metric not in data.columns: - raise ValueError( - f"Metric '{metric}' not found in data for id '{row['id']}'" - ) - - metric_data = data[[rownames, metric]] - - # Handle deduplication if row_dedup_func is provided - if row_dedup_func is not None: - metric_data = ( - metric_data.groupby(rownames)[metric] - .apply(row_dedup_func) - .reset_index() - ) - else: - # Ensure no duplicates exist if no deduplication function is provided - if metric_data[rownames].duplicated().any(): - raise ValueError( - f"Duplicate entries found for metric '{metric}' " - f"in id '{row['id']}' without dedup_func" - ) - - # test 
if row[colnames] is already in output_dict[metric]. If it is, add a - # replicate suffix and try again, Continue doing this until the column name - # is unique - colname = row[colnames] - suffix = 2 - while colname in output_dict[metric].columns: - colname = f"{row[colnames]}_rep{suffix}" - suffix += 1 - if suffix > 2: - info_msgs.add( - f"Column name '{row[colnames]}' already exists in " - f"output DataFrame for metric '{metric}'. " - f"Renaming to '{colname}'" - ) - # Join metric data with output DataFrame for the metric - output_dict[metric] = output_dict[metric].join( - metric_data.set_index(rownames).rename(columns={metric: colname}), - how="left", - ) - logger.info("; ".join(info_msgs)) - - # Drop incomplete rows and columns if drop_incomplete_rows is True - if drop_incomplete_rows: - for metric, df in output_dict.items(): - # Drop rows and columns where all values are NaN - initial_shape = df.shape - output_dict[metric] = df.dropna(axis=0) - final_shape = output_dict[metric].shape - - dropped_rows = initial_shape[0] - final_shape[0] - dropped_columns = initial_shape[1] - final_shape[1] - - if dropped_rows > 0 or dropped_columns > 0: - logger.warning( - f"{dropped_rows} rows and {dropped_columns} " - f"columns with incomplete " - f"records were dropped for metric '{metric}'." - ) - - return output_dict diff --git a/tfbpapi/models.py b/tfbpapi/models.py new file mode 100644 index 0000000..ae0d918 --- /dev/null +++ b/tfbpapi/models.py @@ -0,0 +1,990 @@ +""" +Pydantic models for dataset card validation and metadata configuration. + +These models provide minimal structure for parsing HuggingFace dataset cards while +remaining flexible enough to accommodate diverse experimental systems. Most fields use +extra="allow" to accept domain-specific additions without requiring code changes. + +Also includes models for VirtualDB metadata normalization configuration. 
"""
Pydantic models for dataset card validation and metadata configuration.

These models provide minimal structure for parsing HuggingFace dataset cards while
remaining flexible enough to accommodate diverse experimental systems. Most fields use
extra="allow" to accept domain-specific additions without requiring code changes.

Also includes models for VirtualDB metadata normalization configuration.

"""

import logging
from enum import Enum
from functools import cached_property
from pathlib import Path
from typing import Any, TypeAlias

import yaml  # type: ignore[import-untyped]
from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    computed_field,
    field_serializer,
    field_validator,
    model_validator,
)

# Type aliases for improved readability
FactorAliases: TypeAlias = dict[str, dict[str, list[str | int | float | bool]]]


logger = logging.getLogger(__name__)


class DatasetType(str, Enum):
    """Supported dataset types."""

    GENOMIC_FEATURES = "genomic_features"
    ANNOTATED_FEATURES = "annotated_features"
    GENOME_MAP = "genome_map"
    METADATA = "metadata"
    COMPARATIVE = "comparative"


class FeatureInfo(BaseModel):
    """
    Information about a dataset feature/column.

    Minimal required fields with flexible dtype handling.

    """

    name: str = Field(..., description="Column name in the data")
    dtype: str | dict[str, Any] = Field(
        ...,
        description="Data type (string, int64, float64, etc.) or class_label dict",
    )
    description: str = Field(..., description="Description of the field")
    role: str | None = Field(
        default=None,
        description="Optional semantic role. 'experimental_condition' "
        "has special behavior.",
    )
    definitions: dict[str, Any] | None = Field(
        default=None,
        description="For experimental_condition fields: definitions per value",
    )


class PartitioningInfo(BaseModel):
    """Partitioning configuration for datasets."""

    enabled: bool = Field(default=False, description="Whether partitioning is enabled")
    partition_by: list[str] | None = Field(
        default=None, description="Partition column names"
    )
    path_template: str | None = Field(
        default=None, description="Path template for partitioned files"
    )


class DatasetInfo(BaseModel):
    """Dataset structure information."""

    features: list[FeatureInfo] = Field(..., description="Feature definitions")
    partitioning: PartitioningInfo | None = Field(
        default=None, description="Partitioning configuration"
    )


class DataFileInfo(BaseModel):
    """Information about data files."""

    split: str = Field(default="train", description="Dataset split name")
    path: str = Field(..., description="Path to data file(s)")


class DatasetConfig(BaseModel):
    """
    Configuration for a dataset within a repository.

    Uses extra="allow" to accept arbitrary experimental_conditions and other fields.

    """

    config_name: str = Field(..., description="Unique configuration identifier")
    description: str = Field(..., description="Human-readable description")
    dataset_type: DatasetType = Field(..., description="Type of dataset")
    default: bool = Field(
        default=False, description="Whether this is the default config"
    )
    applies_to: list[str] | None = Field(
        default=None, description="Configs this metadata applies to"
    )
    metadata_fields: list[str] | None = Field(
        default=None, description="Fields for embedded metadata extraction"
    )
    data_files: list[DataFileInfo] = Field(..., description="Data file information")
    dataset_info: DatasetInfo = Field(..., description="Dataset structure information")

    model_config = ConfigDict(extra="allow")

    @field_validator("applies_to", mode="after")
    @classmethod
    def applies_to_only_for_metadata(
        cls, v: list[str] | None, info
    ) -> list[str] | None:
        """
        Validate that applies_to is only used for metadata or comparative configs.

        :param v: The applies_to field value
        :param info: Validation info containing other field values
        :return: The validated applies_to value
        :raises ValueError: If applies_to is used with invalid dataset type

        """
        if v is not None:
            # dataset_type is declared before applies_to, so it is available
            # in info.data by the time this validator runs
            dataset_type = info.data.get("dataset_type")
            if dataset_type not in (DatasetType.METADATA, DatasetType.COMPARATIVE):
                raise ValueError(
                    "applies_to field is only valid for metadata and "
                    f"comparative dataset types, not '{dataset_type}'"
                )
        return v

    @field_validator("metadata_fields", mode="after")
    @classmethod
    def metadata_fields_not_empty(cls, v: list[str] | None) -> list[str] | None:
        """
        Validate metadata_fields is not an empty list.

        :param v: The metadata_fields value
        :return: The validated metadata_fields value
        :raises ValueError: If metadata_fields is an empty list

        """
        if v is not None and len(v) == 0:
            raise ValueError("metadata_fields cannot be empty list, use None instead")
        return v


class DatasetCard(BaseModel):
    """
    Complete dataset card model.

    Uses extra="allow" to accept arbitrary top-level metadata and
    experimental_conditions.

    """

    configs: list[DatasetConfig] = Field(..., description="Dataset configurations")

    model_config = ConfigDict(extra="allow")

    @field_validator("configs", mode="after")
    @classmethod
    def validate_configs(cls, v: list[DatasetConfig]) -> list[DatasetConfig]:
        """
        Validate configs list.

        Ensures at least one config exists, all config names are unique, and at most
        one config is marked as default.

        :param v: The list of DatasetConfig objects
        :return: The validated list of configs
        :raises ValueError: If validation fails

        """
        # Check non-empty
        if not v:
            raise ValueError("At least one dataset configuration is required")

        # Check unique names; name the offenders to make the error actionable
        names = [config.config_name for config in v]
        duplicates = sorted({n for n in names if names.count(n) > 1})
        if duplicates:
            raise ValueError(
                f"Configuration names must be unique; duplicated: {duplicates}"
            )

        # Check at most one default
        defaults = sum(1 for config in v if config.default)
        if defaults > 1:
            raise ValueError("At most one configuration can be marked as default")

        return v

    # Computed properties for better discoverability
    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def default_config(self) -> DatasetConfig | None:
        """
        Get the default configuration if one exists.

        :return: The default DatasetConfig or None if no default is set

        """
        for config in self.configs:
            if config.default:
                return config
        return None

    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def config_names(self) -> list[str]:
        """
        Get all configuration names.

        :return: List of all config_name values

        """
        return [config.config_name for config in self.configs]

    # Utility methods (not serialized)
    def get_config_by_name(self, name: str) -> DatasetConfig | None:
        """
        Get a configuration by name.

        :param name: The configuration name to search for
        :return: The matching DatasetConfig or None if not found

        """
        for config in self.configs:
            if config.config_name == name:
                return config
        return None

    def get_configs_by_type(self, dataset_type: DatasetType) -> list[DatasetConfig]:
        """
        Get all configurations of a specific type.

        :param dataset_type: The DatasetType to filter by
        :return: List of matching DatasetConfig objects

        """
        return [
            config for config in self.configs if config.dataset_type == dataset_type
        ]

    def get_data_configs(self) -> list[DatasetConfig]:
        """
        Get all non-metadata configurations.

        :return: List of DatasetConfig objects excluding metadata types

        """
        return [
            config
            for config in self.configs
            if config.dataset_type != DatasetType.METADATA
        ]

    def get_metadata_configs(self) -> list[DatasetConfig]:
        """
        Get all metadata configurations.

        :return: List of DatasetConfig objects with metadata type

        """
        return [
            config
            for config in self.configs
            if config.dataset_type == DatasetType.METADATA
        ]
class ExtractedMetadata(BaseModel):
    """Metadata extracted from datasets."""

    config_name: str = Field(..., description="Source configuration name")
    field_name: str = Field(
        ..., description="Field name the metadata was extracted from"
    )
    values: set[str] = Field(..., description="Unique values found")
    extraction_method: str = Field(..., description="How the metadata was extracted")

    @field_serializer("values", mode="plain")
    def serialize_values(self, value: set[str]) -> list[str]:
        """
        Serialize set as sorted list for JSON compatibility.

        Sorting makes serialization deterministic across runs.

        :param value: Set of string values
        :return: Sorted list of strings

        """
        return sorted(value)


class MetadataRelationship(BaseModel):
    """Relationship between a data config and its metadata."""

    data_config: str = Field(..., description="Data configuration name")
    metadata_config: str = Field(..., description="Metadata configuration name")
    relationship_type: str = Field(
        ..., description="Type of relationship (explicit, embedded)"
    )


# ============================================================================
# VirtualDB Metadata Configuration Models
# ============================================================================


class PropertyMapping(BaseModel):
    """
    Mapping specification for a single property.

    :ivar field: Optional field name for field-level properties.
        When specified, looks in this field's definitions.
        When omitted, uses repo/config-level resolution.
    :ivar path: Optional dot-notation path to the property value.
        For repo/config-level: relative to datacard/config root
        (e.g., "experimental_conditions.media.carbon_source" or "description")
        For field-level: relative to the field's definitions dict
        (e.g., "temperature_celsius" resolves within each sample's definition)
        When omitted with field specified, creates a column alias.
    :ivar expression: Optional SQL expression for derived/computed fields.
        When specified, creates a computed column.
        Cannot be used with field or path.
    :ivar dtype: Optional data type specification for type conversion.
        Supported values: 'string', 'numeric', 'bool', 'factor'.
        When specified, extracted values are converted to this type.

    Examples::

        # Repo/config-level property (explicit path from datacard root)
        PropertyMapping(path="experimental_conditions.media.carbon_source.compound")

        # Repo/config-level property outside experimental_conditions
        PropertyMapping(path="description")

        # Field-level property with path (relative to field definitions)
        PropertyMapping(field="condition", path="temperature_celsius")

        # Field-level column alias (no path)
        PropertyMapping(field="condition")

        # Derived field with expression
        PropertyMapping(expression="dto_fdr < 0.05")

    """

    field: str | None = Field(None, description="Field name for field-level properties")
    path: str | None = Field(None, description="Dot-notation path to property")
    expression: str | None = Field(
        None, description="SQL expression for derived fields"
    )
    dtype: str | None = Field(
        None,
        description=(
            "Data type for conversion: 'string', 'numeric', 'bool', or 'factor'. "
            "When 'factor', the field must reference a DataCard field with a "
            "class_label dtype specifying the allowed levels. VirtualDB will "
            "register a DuckDB ENUM type and cast the column to it."
        ),
    )

    @field_validator("path", "field", "expression", mode="before")
    @classmethod
    def strip_whitespace(cls, v: str | None) -> str | None:
        """
        Strip whitespace and validate non-empty strings.

        :param v: String value to validate
        :return: Stripped string or None
        :raises ValueError: If string is empty or only whitespace

        """
        if v is None:
            return None
        v = v.strip()
        if not v:
            raise ValueError("Value cannot be empty or whitespace")
        return v

    @field_validator("dtype", mode="after")
    @classmethod
    def validate_dtype(cls, v: str | None) -> str | None:
        """
        Restrict dtype to the documented set of conversion targets.

        Previously any string was silently accepted here even though only
        'string', 'numeric', 'bool', and 'factor' are supported downstream.

        :param v: dtype value
        :return: The validated dtype
        :raises ValueError: If dtype is not a supported value

        """
        if v is not None and v not in ("string", "numeric", "bool", "factor"):
            raise ValueError(
                "dtype must be one of 'string', 'numeric', 'bool', 'factor'; "
                f"got '{v}'"
            )
        return v

    @model_validator(mode="after")
    def validate_field_types(self) -> "PropertyMapping":
        """
        Ensure at least one field type is specified and mutually exclusive.

        Also validates dtype='factor' requires a field (not expression or path-only).

        :return: The validated PropertyMapping instance
        :raises ValueError: If validation constraints are violated

        """
        if self.expression is not None:
            if self.field is not None or self.path is not None:
                raise ValueError(
                    "expression cannot be used with field or path - "
                    "derived fields are computed, not extracted"
                )
        elif self.field is None and self.path is None:
            raise ValueError(
                "At least one of 'field', 'path', or 'expression' must be specified"
            )
        if self.dtype == "factor":
            if self.expression is not None or self.field is None:
                raise ValueError(
                    "dtype='factor' requires 'field' to be specified and "
                    "cannot be used with 'expression' or as a path-only mapping"
                )
        return self
class DatasetVirtualDBConfig(BaseModel):
    """
    VirtualDB configuration for a specific dataset within a repository.

    Additional property mappings can be provided as extra fields and will be
    automatically parsed as PropertyMapping objects.

    :ivar sample_id: Mapping for the sample identifier field (required for
        primary datasets)
    :ivar links: For comparative datasets, map link_field -> list of
        [repo_id, config_name] pairs specifying which primary datasets
        are linked through each link field.

    Example - Primary dataset::

        annotated_features:
          sample_id:
            field: sample_id
          regulator_locus_tag:
            field: regulator_locus_tag

    Example - Comparative dataset::

        dto:
          # Field mappings - use this to rename fields
          dto_fdr:
            field: dto_fdr
          dto_pvalue:
            field: empirical_pvalue  # renames empirical_pvalue to dto_pvalue
          # Links to primary datasets
          links:
            binding_id:
              - [BrentLab/harbison_2004, harbison_2004]
              - [BrentLab/callingcards, annotated_features]
            perturbation_id:
              - [BrentLab/kemmeren_2014, kemmeren_2014]

    """

    sample_id: PropertyMapping | None = Field(
        None, description="Mapping for sample identifier field"
    )
    db_name: str | None = Field(
        None,
        description=(
            "Short name for this dataset in the SQL interface. "
            "Falls back to the config_name (YAML dict key) if not "
            "specified. Must be a valid SQL identifier."
        ),
    )
    links: dict[str, list[list[str]]] = Field(
        default_factory=dict,
        description="For comparative datasets: map link_field -> "
        "[repo_id, config_name] pairs",
    )
    tags: dict[str, str] = Field(
        default_factory=dict,
        description="Arbitrary key/value annotations for this dataset",
    )

    model_config = ConfigDict(extra="allow")

    @field_validator("links", mode="after")
    @classmethod
    def validate_links(
        cls, v: dict[str, list[list[str]]]
    ) -> dict[str, list[list[str]]]:
        """
        Validate that each link is a [repo_id, config_name] pair of strings.

        Previously only the two-element shape was checked; a pair like
        [123, None] passed silently and failed later at query time.

        :param v: Links dictionary
        :return: Validated links
        :raises ValueError: If any link is not a valid pair

        """
        for link_field, datasets in v.items():
            for i, dataset_pair in enumerate(datasets):
                if not isinstance(dataset_pair, list) or len(dataset_pair) != 2:
                    raise ValueError(
                        f"Link {i} for link_field '{link_field}' must be "
                        f"[repo_id, config_name], got: {dataset_pair}"
                    )
                if not all(
                    isinstance(item, str) and item for item in dataset_pair
                ):
                    raise ValueError(
                        f"Link {i} for link_field '{link_field}' must contain "
                        f"two non-empty strings, got: {dataset_pair}"
                    )
        return v

    @field_validator("db_name", mode="after")
    @classmethod
    def validate_db_name(cls, v: str | None) -> str | None:
        """
        Validate db_name is a valid SQL identifier and not reserved.

        :param v: db_name value
        :return: Validated db_name
        :raises ValueError: If db_name is invalid

        """
        if v is None:
            return None
        import re

        if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", v):
            raise ValueError(
                f"db_name '{v}' is not a valid SQL identifier. "
                "Use only letters, digits, and underscores, "
                "starting with a letter or underscore."
            )
        # "samples" is used internally by VirtualDB
        reserved = {"samples"}
        if v.lower() in reserved:
            raise ValueError(f"db_name '{v}' is reserved for internal use.")
        return v

    @model_validator(mode="before")
    @classmethod
    def parse_property_mappings(cls, data: Any) -> dict[str, Any]:
        """
        Parse extra fields as PropertyMapping objects.

        :param data: Raw input data
        :return: Processed data with PropertyMapping objects
        :raises ValueError: If PropertyMapping validation fails

        """
        if not isinstance(data, dict):
            return data

        result = {}
        for key, value in data.items():
            # Known typed fields - let Pydantic handle them
            if key in ("sample_id", "links", "db_name", "tags"):
                result[key] = value
            # Dict values should be PropertyMappings
            elif isinstance(value, dict):
                try:
                    result[key] = PropertyMapping.model_validate(value)
                except Exception as e:
                    raise ValueError(
                        f"Invalid PropertyMapping for field '{key}': {e}"
                    ) from e
            # Already parsed PropertyMapping or other type
            else:
                result[key] = value

        return result

    @property
    def property_mappings(self) -> dict[str, PropertyMapping]:
        """
        Get all property mappings from extra fields.

        :return: Dictionary of property names to PropertyMapping objects

        """
        if not self.model_extra:
            return {}

        return {
            key: value
            for key, value in self.model_extra.items()
            if isinstance(value, PropertyMapping)
        }
class RepositoryConfig(BaseModel):
    """
    Configuration for a single repository.

    For example: BrentLab/harbison_2004

    :ivar properties: Repo-wide property mappings that apply to all datasets
    :ivar dataset: Dataset-specific configurations including sample_id,
        comparative_analyses, and property mappings
    :ivar tags: Arbitrary key/value annotations applied to all datasets

    Example::

        BrentLab/harbison_2004:
          temperature_celsius:
            path: temperature_celsius
          dataset:
            harbison_2004:
              sample_id:
                field: sample_id
              carbon_source:
                field: condition
                path: media.carbon_source

    """

    properties: dict[str, PropertyMapping] = Field(
        default_factory=dict, description="Repo-wide property mappings"
    )
    dataset: dict[str, DatasetVirtualDBConfig] | None = Field(
        None, description="Dataset-specific configurations"
    )
    tags: dict[str, str] = Field(
        default_factory=dict,
        description="Arbitrary key/value annotations for all datasets in this repo",
    )

    @model_validator(mode="before")
    @classmethod
    def parse_structure(cls, data: Any) -> dict[str, Any]:
        """
        Parse raw dict structure into typed objects.

        Accepts both the raw YAML layout (repo-wide mappings as arbitrary
        top-level keys) and the normalized layout produced by model_dump()
        (mappings nested under an explicit 'properties' key), making
        re-validation of dumped configs possible.

        :param data: Raw input data
        :return: Processed data with typed objects
        :raises ValueError: If validation fails

        """
        if not isinstance(data, dict):
            return data

        # Parse dataset section
        parsed_datasets: dict[str, DatasetVirtualDBConfig] | None = None
        dataset_section = data.get("dataset")

        if dataset_section:
            if not isinstance(dataset_section, dict):
                raise ValueError("'dataset' key must contain a dict")

            parsed_datasets = {}
            for dataset_name, config_dict in dataset_section.items():
                if not isinstance(config_dict, dict):
                    raise ValueError(f"Dataset '{dataset_name}' must contain a dict")

                try:
                    parsed_datasets[dataset_name] = (
                        DatasetVirtualDBConfig.model_validate(config_dict)
                    )
                except Exception as e:
                    raise ValueError(
                        f"Invalid configuration for dataset '{dataset_name}': {e}"
                    ) from e

        parsed_properties: dict[str, PropertyMapping] = {}

        # Normalized payloads (e.g. a model_dump() round-trip) carry the
        # repo-wide mappings under 'properties'. Previously this key was
        # itself parsed as a single PropertyMapping and always failed.
        explicit_properties = data.get("properties")
        if isinstance(explicit_properties, dict):
            for key, value in explicit_properties.items():
                try:
                    parsed_properties[key] = PropertyMapping.model_validate(value)
                except Exception as e:
                    raise ValueError(
                        f"Invalid repo-wide property '{key}': {e}"
                    ) from e

        # Raw YAML layout: every other top-level key is a repo-wide mapping
        for key, value in data.items():
            if key in ("dataset", "tags", "properties"):
                continue

            try:
                parsed_properties[key] = PropertyMapping.model_validate(value)
            except Exception as e:
                raise ValueError(f"Invalid repo-wide property '{key}': {e}") from e

        return {
            "properties": parsed_properties,
            "dataset": parsed_datasets,
            "tags": data.get("tags") or {},
        }
class MetadataConfig(BaseModel):
    """
    Configuration for building standardized metadata tables.

    Specifies optional alias mappings for normalizing factor levels across
    heterogeneous datasets, plus property path mappings for each repository.

    :ivar factor_aliases: Optional mappings of standardized names to actual values.
        Example: {"carbon_source": {"glucose": ["D-glucose", "dextrose"]}}
    :ivar missing_value_labels: Labels for missing values by property name
    :ivar description: Human-readable descriptions for each property
    :ivar repositories: Dict mapping repository IDs to their configurations

    Example::

        repositories:
          BrentLab/harbison_2004:
            dataset:
              harbison_2004:
                carbon_source:
                  field: condition
                  path: media.carbon_source

          BrentLab/kemmeren_2014:
            temperature:
              path: temperature_celsius
            dataset:
              kemmeren_2014:
                carbon_source:
                  path: media.carbon_source

        factor_aliases:
          carbon_source:
            glucose: ["D-glucose", "dextrose"]
            galactose: ["D-galactose", "Galactose"]

        missing_value_labels:
          carbon_source: "unspecified"

        description:
          carbon_source: "Carbon source in growth media"

    """

    factor_aliases: FactorAliases = Field(
        default_factory=dict,
        description="Optional alias mappings for normalizing factor levels",
    )
    missing_value_labels: dict[str, str] = Field(
        default_factory=dict,
        description="Labels for missing values by property name",
    )
    description: dict[str, str] = Field(
        default_factory=dict,
        description="Human-readable descriptions for each property",
    )
    repositories: dict[str, RepositoryConfig] = Field(
        ..., description="Repository configurations keyed by repo ID"
    )

    @field_validator("missing_value_labels", "description", mode="before")
    @classmethod
    def filter_none_values(cls, v: dict[str, str] | None) -> dict[str, str]:
        """
        Filter out None values that may come from empty YAML values.

        :param v: Dictionary that may contain None values
        :return: Dictionary with None values filtered out

        """
        if not v:
            return {}
        # Pydantic will validate it's a dict, we just filter None values
        return {k: val for k, val in v.items() if val is not None}

    @field_validator("factor_aliases", mode="after")
    @classmethod
    def validate_factor_aliases(cls, v: FactorAliases) -> FactorAliases:
        """
        Validate factor alias structure and value types.

        :param v: Factor aliases dictionary
        :return: Validated factor aliases
        :raises ValueError: If any alias has an empty value list

        """
        for prop_name, aliases in v.items():
            for alias_name, actual_values in aliases.items():
                if not actual_values:
                    raise ValueError(
                        f"Alias '{alias_name}' for '{prop_name}' cannot "
                        f"have empty value list"
                    )
        return v

    @model_validator(mode="after")
    def validate_repositories_have_datasets(self) -> "MetadataConfig":
        """
        Validate that every repository defines at least one dataset.

        :return: The validated MetadataConfig instance
        :raises ValueError: If any repository has no datasets defined

        """
        for repo_id, repo_config in self.repositories.items():
            if not repo_config.dataset:
                raise ValueError(
                    f"Repository '{repo_id}' must define at least one "
                    "dataset under the 'dataset' key."
                )
        return self

    @model_validator(mode="after")
    def validate_unique_db_names(self) -> "MetadataConfig":
        """
        Validate that all resolved db_names are unique across datasets.

        Each dataset resolves to db_name or config_name. These must be unique to
        avoid SQL view name collisions. Comparison is case-insensitive because
        SQL identifiers are.

        :return: The validated MetadataConfig instance
        :raises ValueError: If duplicate db_names are found

        """
        seen: dict[str, str] = {}
        for repo_id, repo_config in self.repositories.items():
            if not repo_config.dataset:
                continue
            for config_name, dataset_config in repo_config.dataset.items():
                resolved = dataset_config.db_name or config_name
                key = resolved.lower()
                if key in seen:
                    raise ValueError(
                        f"Duplicate db_name '{resolved}': used by "
                        f"'{seen[key]}' and "
                        f"'{repo_id}/{config_name}'"
                    )
                seen[key] = f"{repo_id}/{config_name}"
        return self

    @model_validator(mode="before")
    @classmethod
    def parse_config(cls, data: Any) -> dict[str, Any]:
        """
        Parse and validate all top-level sections of the VirtualDB configuration.

        Handles the four top-level sections: ``repositories`` (required),
        ``factor_aliases``, ``missing_value_labels``, and ``description``
        (all optional). Logs an INFO message for each optional section that
        is absent, and a WARNING for unrecognized top-level keys (previously
        these were silently discarded, hiding typos like ``descriptions``).

        :param data: Raw configuration data
        :return: Processed configuration dict ready for Pydantic field validation
        :raises ValueError: If ``repositories`` is missing or empty, or if
            any repository config is invalid

        """
        if not isinstance(data, dict):
            return data

        repositories_data = data.get("repositories", {})

        if not repositories_data:
            raise ValueError(
                "Configuration must have a 'repositories' key "
                "with at least one repository"
            )

        known_keys = {
            "repositories",
            "factor_aliases",
            "missing_value_labels",
            "description",
        }
        unknown_keys = sorted(set(data) - known_keys)
        if unknown_keys:
            logger.warning(
                "Ignoring unrecognized top-level configuration keys: %s",
                unknown_keys,
            )

        for optional_key in ("factor_aliases", "missing_value_labels", "description"):
            if not data.get(optional_key):
                logger.info(
                    "No '%s' section found in VirtualDB configuration.",
                    optional_key,
                )

        # Parse each repository config
        repositories = {}
        for repo_id, repo_config in repositories_data.items():
            try:
                repositories[repo_id] = RepositoryConfig.model_validate(repo_config)
            except Exception as e:
                raise ValueError(
                    f"Invalid configuration for repository '{repo_id}': {e}"
                ) from e

        return {
            "factor_aliases": data.get("factor_aliases", {}),
            "missing_value_labels": data.get("missing_value_labels", {}),
            "description": data.get("description", {}),
            "repositories": repositories,
        }

    @classmethod
    def from_yaml(cls, path: Path | str) -> "MetadataConfig":
        """
        Load and validate configuration from YAML file.

        :param path: Path to YAML configuration file
        :return: Validated MetadataConfig instance
        :raises ValidationError: If configuration is invalid
        :raises FileNotFoundError: If file doesn't exist
        :raises ValueError: If YAML file does not contain a dictionary

        """
        with open(Path(path)) as f:
            data = yaml.safe_load(f)

        if not isinstance(data, dict):
            raise ValueError(
                f"Configuration file must contain a YAML dictionary, "
                f"got {type(data).__name__} instead"
            )

        return cls.model_validate(data)

    def get_repository_config(self, repo_id: str) -> RepositoryConfig | None:
        """
        Get configuration for a specific repository.

        :param repo_id: Repository ID (e.g., "BrentLab/harbison_2004")
        :return: RepositoryConfig instance or None if not found

        """
        return self.repositories.get(repo_id)

    def get_property_mappings(
        self, repo_id: str, config_name: str
    ) -> dict[str, PropertyMapping]:
        """
        Get merged property mappings for a repo/dataset combination.

        Merges repo-wide and dataset-specific mappings, with dataset-specific
        taking precedence.

        :param repo_id: Repository ID
        :param config_name: Dataset/config name
        :return: Dict mapping property names to PropertyMapping objects

        """
        repo_config = self.get_repository_config(repo_id)
        if not repo_config:
            return {}

        # Start with repo-wide properties
        mappings: dict[str, PropertyMapping] = dict(repo_config.properties)

        # Override with dataset-specific properties
        if repo_config.dataset and config_name in repo_config.dataset:
            dataset_config = repo_config.dataset[config_name]
            mappings.update(dataset_config.property_mappings)

        return mappings

    def get_tags(self, repo_id: str, config_name: str) -> dict[str, str]:
        """
        Get merged tags for a repo/dataset combination.

        Merges repo-level and dataset-level tags, with dataset-level tags taking
        precedence for the same key.

        :param repo_id: Repository ID
        :param config_name: Dataset/config name
        :return: Dict of merged tags

        """
        repo_config = self.get_repository_config(repo_id)
        if not repo_config:
            return {}

        merged: dict[str, str] = dict(repo_config.tags)

        if repo_config.dataset and config_name in repo_config.dataset:
            merged.update(repo_config.dataset[config_name].tags)

        return merged

    def get_sample_id_field(self, repo_id: str, config_name: str) -> str:
        """
        Resolve the actual column name for the sample identifier.

        Checks dataset-level ``sample_id`` first, then repo-level,
        falling back to ``"sample_id"`` if neither is configured.

        :param repo_id: Repository ID
        :param config_name: Dataset/config name
        :return: Column name for the sample identifier

        """
        repo_cfg = self.get_repository_config(repo_id)
        if not repo_cfg:
            return "sample_id"

        # Dataset-level takes precedence
        if repo_cfg.dataset and config_name in repo_cfg.dataset:
            ds_cfg = repo_cfg.dataset[config_name]
            if ds_cfg.sample_id is not None and ds_cfg.sample_id.field:
                return ds_cfg.sample_id.field

        # Repo-level fallback
        repo_sample_id = repo_cfg.properties.get("sample_id")
        if repo_sample_id is not None and repo_sample_id.field is not None:
            return repo_sample_id.field

        return "sample_id"
+ + :param repo_id: Repository ID + :param config_name: Dataset/config name + :return: Column name for the sample identifier + + """ + repo_cfg = self.get_repository_config(repo_id) + if not repo_cfg: + return "sample_id" + + # Dataset-level takes precedence + if repo_cfg.dataset and config_name in repo_cfg.dataset: + ds_cfg = repo_cfg.dataset[config_name] + if ds_cfg.sample_id is not None and ds_cfg.sample_id.field: + return ds_cfg.sample_id.field + + # Repo-level fallback + repo_sample_id = repo_cfg.properties.get("sample_id") + if repo_sample_id is not None and repo_sample_id.field is not None: + return repo_sample_id.field + + return "sample_id" diff --git a/tfbpapi/rank_transforms.py b/tfbpapi/rank_transforms.py deleted file mode 100644 index 9e4c672..0000000 --- a/tfbpapi/rank_transforms.py +++ /dev/null @@ -1,154 +0,0 @@ -import numpy as np -from scipy.stats import rankdata - - -def shifted_negative_log_ranks(ranks: np.ndarray) -> np.ndarray: - """ - Transforms ranks to negative log10 values and shifts such that the lowest value is - 0. - - :param ranks: A vector of ranks - :return np.ndarray: A vector of negative log10 transformed ranks shifted such that - the lowest value is 0 - :raises ValueError: If the ranks are not numeric. - - """ - if not np.issubdtype(ranks.dtype, np.number): - raise ValueError("`ranks` must be a numeric") - max_rank = np.max(ranks) - log_max_rank = np.log10(max_rank) - return -1 * np.log10(ranks) + log_max_rank - - -def stable_rank( - pvalue_vector: np.ndarray, enrichment_vector: np.ndarray, method="average" -) -> np.ndarray: - """ - Ranks data by primary_column, breaking ties based on secondary_column. The expected - primary and secondary columns are 'pvalue' and 'enrichment', respectively. Then the - ranks are transformed to negative log10 values and shifted such that the lowest - value is 0 and the highest value is log10(min_rank). 
- - :param pvalue_vector: A vector of pvalues - :param enrichment_vector: A vector of enrichment values corresponding to the pvalues - :param method: The method to use for final ranking. Default is "average". - See `rankdata` - - :return np.ndarray: A vector of negative log10 transformed ranks shifted such that - the lowest value is 0 and the highest value is log10(min_rank) - :raises ValueError: If the primary or secondary column is not numeric. - - """ - - # Check if primary and secondary columns are numeric - if not np.issubdtype(pvalue_vector.dtype, np.number): - raise ValueError("`primary_vector` must be a numeric") - if not np.issubdtype(enrichment_vector.dtype, np.number): - raise ValueError("`secondary_vector` must be a numeric") - - # Step 1: Rank by primary_column - # note that this will now always be an integer, unlike average which could return - # decimal values making adding the secondary rank more difficult - primary_rank = rankdata(pvalue_vector, method="min") - - # Step 2: Identify ties in primary_rank - unique_ranks = np.unique(primary_rank) - - # Step 3: Adjust ranks within ties using secondary ranking - adjusted_primary_rank = primary_rank.astype( - float - ) # Convert to float for adjustments - - for unique_rank in unique_ranks: - # Get indices where primary_rank == unique_rank - tie_indices = np.where(primary_rank == unique_rank)[0] - - if len(tie_indices) > 1: # Only adjust if there are ties - # Rank within the tie group by secondary_column - # (descending if higher is better) - tie_secondary_values = enrichment_vector[tie_indices] - secondary_rank_within_ties = rankdata( - -tie_secondary_values, method="average" - ) - - # Calculate dynamic scale factor to ensure adjustments are < 1. Since the - # primary_rank is an integer, adding a number less than 1 will not affect - # rank relative to the other groups. 
- max_secondary_rank = np.max(secondary_rank_within_ties) - scale_factor = ( - 0.9 / max_secondary_rank - ) # Keep scale factor slightly below 1/max rank - - # multiple the secondary_rank_within_ties values by 0.1 and add this value - # to the adjusted_primary_rank_values. This will rank the tied primary - # values by the secondary values, but not affect the overall primary rank - # outside of the tie group - # think about this scale factor - adjusted_primary_rank[tie_indices] += ( - secondary_rank_within_ties * scale_factor - ) - - # Step 4: Final rank based on the adjusted primary ranks - final_ranks = rankdata(adjusted_primary_rank, method=method) - - return final_ranks - - -def rank_by_pvalue(pvalue_vector: np.ndarray, method="average") -> np.ndarray: - """ - This expects a vector of pvalues, returns a vector of ranks where the lowest pvalue - has the lowest rank. - - :param pvalue_vector: A vector of pvalues - :param enrichment_vector: A vector of enrichment values corresponding to the pvalues - :param method: The method to use for ranking. Default is "average". See `rankdata` - :return np.ndarray: A vector of negative log10 transformed ranks shifted such that - the lowest value is 0 and the highest value is log10(min_rank) - :raises ValueError: If the primary or secondary column is not numeric. 
- - """ - - # Check if primary and secondary columns are numeric - if not np.issubdtype(pvalue_vector.dtype, np.number): - raise ValueError("`primary_vector` must be a numeric") - - # Step 1: Rank by primary_column - # note that this will now always be an integer, unlike average which could return - # decimal values making adding the secondary rank more difficult - return rankdata(pvalue_vector, method=method) - - -def transform( - pvalue_vector: np.ndarray, - enrichment_vector: np.ndarray, - use_enrichment: bool = True, - negative_log_shift: bool = True, - **kwargs, -) -> np.ndarray: - """ - This calls the rank() function and then transforms the ranks to negative log10 - values and shifts to the right such that the lowest value (largest rank, least - important) is 0. - - :param pvalue_vector: A vector of pvalues - :param enrichment_vector: A vector of enrichment values corresponding to the pvalues - :param use_enrichment: Set to True to use the enrichment vector to break ties. - Default is True. If False, pvalues will be ranked directly with method="average' - :param negative_log_shift: Set to True to shift the ranks to the right such that the - lowest value (largest rank, least important) is 0. Default is True. - :param kwargs: Additional keyword arguments to pass to the rank() function (e.g. - method="min") - :return np.ndarray: A vector of negative log10 transformed ranks shifted such that - the lowest value is 0 and the highest value is log10(min_rank) - :raises ValueError: If the primary or secondary column is not numeric. 
- - """ - if use_enrichment: - ranks = stable_rank(pvalue_vector, enrichment_vector, **kwargs) - else: - ranks = rank_by_pvalue(pvalue_vector, **kwargs) - - if negative_log_shift: - return shifted_negative_log_ranks(ranks) - else: - return ranks diff --git a/tfbpapi/tests/conftest.py b/tfbpapi/tests/conftest.py new file mode 100644 index 0000000..9fdd767 --- /dev/null +++ b/tfbpapi/tests/conftest.py @@ -0,0 +1,1466 @@ +import pickle +from pathlib import Path +from unittest.mock import patch + +import pytest + + +@pytest.fixture +def mock_cache_info(): + """Load real cache data from pickle file.""" + cache_file = Path(__file__).parent / "data" / "cache_info.pkl" + + if not cache_file.exists(): + pytest.skip( + "test_cache_data.pkl not found. Run cache data generation script first." + ) + + with open(cache_file, "rb") as f: + return pickle.load(f) + + +@pytest.fixture +def mock_scan_cache_dir(mock_cache_info): + """Mock scan_cache_dir to return our pickled cache data.""" + with patch("huggingface_hub.scan_cache_dir", return_value=mock_cache_info): + yield mock_cache_info + + +# ============================================================================ +# Datainfo Fixtures (merged from tests/datainfo/conftest.py) +# ============================================================================ + + +@pytest.fixture +def sample_dataset_card_data(): + """Sample dataset card data for testing.""" + return { + "license": "mit", + "language": ["en"], + "tags": ["biology", "genomics", "yeast"], + "pretty_name": "Test Genomics Dataset", + "size_categories": ["100K log2(1.7) & " + "pval < 0.05). Note that " + "there is a slight " + "difference when " + "calculating from the data " + "provided here, I believe " + "due to a difference in " + "the way the targets are " + "parsed and filtered (some " + "ORFs that have since been " + "removed from the " + "annotations are removed). 
" + "I didn't investigate this " + "closely, though.", + "role": "experimental_condition", + }, + { + "name": "profile_first_published", + "dtype": "string", + "description": "citation or reference " + "indicating where this " + "expression profile was " + "first published", + "role": "experimental_condition", + }, + { + "name": "chase_notes", + "dtype": "string", + "description": "notes added during data " + "curation and parsing", + }, + ] + }, + } + ], + } diff --git a/tfbpapi/tests/conftests.py b/tfbpapi/tests/conftests.py deleted file mode 100644 index e69de29..0000000 diff --git a/tfbpapi/tests/example_datacards.py b/tfbpapi/tests/example_datacards.py new file mode 100644 index 0000000..36b023f --- /dev/null +++ b/tfbpapi/tests/example_datacards.py @@ -0,0 +1,510 @@ +# flake8: noqa +""" +Three diverse datacard examples for testing datacard parsing and database construction. + +These examples capture different patterns of experimental condition specification: +1. Top-level conditions with field-level variations (minimal media) +2. Complex field-level definitions with multiple environmental conditions +3. 
Partitioned dataset with separate metadata configs using applies_to + +""" + +EXAMPLE_1_SIMPLE_TOPLEVEL = """--- +license: mit +language: + - en +tags: + - genomics + - yeast + - transcription +pretty_name: "Example Dataset 1 - TF Perturbation" +size_categories: + - 100K- + Systematic gene identifier of the ChIP-targeted transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the ChIP-targeted transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: Systematic gene identifier of the target gene + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene + role: target_identifier + - name: condition + dtype: + class_label: + names: ["YPD", "galactose", "heat_shock", "oxidative_stress", + "amino_acid_starvation"] + description: Environmental or stress condition of the experiment + role: experimental_condition + definitions: + YPD: + description: Rich media baseline condition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + galactose: + description: Alternative carbon source condition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-galactose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + heat_shock: + description: Temperature stress condition + environmental_conditions: + temperature_celsius: 37 + 
cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + heat_treatment: + duration_minutes: 15 + oxidative_stress: + description: Hydrogen peroxide stress condition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: hydrogen_peroxide + concentration_percent: 0.004 + duration_minutes: 20 + amino_acid_starvation: + description: Amino acid starvation via chemical inhibition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.5 + stage: mid_log_phase + media: + name: synthetic_complete + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_nitrogen_base + # 6.71 g/L + concentration_percent: 0.671 + specifications: + - without_amino_acids + - without_ammonium_sulfate + - compound: ammonium_sulfate + # 5 g/L + concentration_percent: 0.5 + - compound: amino_acid_dropout_mix + # 2 g/L + concentration_percent: 0.2 + chemical_treatment: + compound: 3-amino-1,2,4-triazole + concentration_percent: 0.01 + duration_hours: 1 + - name: binding_score + dtype: float64 + description: ChIP-seq binding enrichment score + role: quantitative_measure + - name: peak_pvalue + dtype: float64 + description: Statistical significance of binding peak + role: quantitative_measure + - name: peak_qvalue + dtype: float64 + description: FDR-adjusted p-value for binding peak + 
role: quantitative_measure +--- +""" + + +EXAMPLE_3_PARTITIONED_WITH_METADATA = """--- +license: mit +language: + - en +tags: + - genomics + - yeast + - binding + - genome-wide + - chec-seq +pretty_name: "Example Dataset 3 - Genome Coverage Compendium" +size_categories: + - 10M- + unique identifier for a specific sample. The sample ID identifies a unique + (regulator_locus_tag, time, mechanism, restriction, date, strain) tuple. + - name: db_id + dtype: integer + description: >- + an old unique identifer, for use internally only. Deprecated and will be removed eventually. + Do not use in analysis. db_id = 0, for GEV and Z3EV, means that those samples are not + included in the original DB. + - name: regulator_locus_tag + dtype: string + description: >- + induced transcriptional regulator systematic ID. + See hf/BrentLab/yeast_genome_resources + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: >- + induced transcriptional regulator common name. If no common name exists, + then the `regulator_locus_tag` is used. + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: >- + The systematic ID of the feature to which the effect/pvalue is assigned. + See hf/BrentLab/yeast_genome_resources + role: target_identifier + - name: target_symbol + dtype: string + description: >- + The common name of the feature to which the effect/pvalue is assigned. + If there is no common name, the `target_locus_tag` is used. 
+ role: target_identifier + - name: time + dtype: float + description: time point (minutes) + role: experimental_condition + - name: mechanism + dtype: + class_label: + names: ["GEV", "ZEV"] + description: Synthetic TF induction system (GEV or ZEV) + role: experimental_condition + definitions: + GEV: + perturbation_method: + type: inducible_overexpression + system: GEV + inducer: beta-estradiol + description: "Galactose-inducible estrogen receptor-VP16 fusion system" + ZEV: + perturbation_method: + type: inducible_overexpression + system: ZEV + inducer: beta-estradiol + description: "Z3 (synthetic zinc finger)-estrogen receptor-VP16 fusion system" + - name: restriction + dtype: + class_label: + names: ["M", "N", "P"] + description: >- + nutrient limitation, one of P (phosphate limitation (20 mg/l).), + N (Nitrogen‐limited cultures were maintained at 40 mg/l ammonium sulfate) or + M (Not defined in the paper or on the Calico website) + role: experimental_condition + definitions: + P: + media: + nitrogen_source: + - compound: ammonium_sulfate + # Saldanha et al 2004: 5 g/l + concentration_percent: 0.5 + phosphate_source: + - compound: potassium_phosphate_monobasic + # Hackett et al 2020: 20 mg/l + concentration_percent: 0.002 + N: + media: + nitrogen_source: + - compound: ammonium_sulfate + # Hackett et al 2020: 40 mg/l + concentration_percent: 0.004 + M: + description: "Not defined in the paper or on the Calico website" + - name: date + dtype: string + description: date performed + role: experimental_condition + - name: strain + dtype: string + description: strain name + role: experimental_condition + - name: green_median + dtype: float + description: median of green (reference) channel fluorescence + role: quantitative_measure + - name: red_median + dtype: float + description: median of red (experimental) channel fluorescence + role: quantitative_measure + - name: log2_ratio + dtype: float + description: log2(red / green) subtracting value at time zero + role: 
quantitative_measure + - name: log2_cleaned_ratio + dtype: float + description: Non-specific stress response and prominent outliers removed + role: quantitative_measure + - name: log2_noise_model + dtype: float + description: estimated noise standard deviation + role: quantitative_measure + - name: log2_cleaned_ratio_zth2d + dtype: float + description: >- + cleaned timecourses hard-thresholded based on + multiple observations (or last observation) passing the noise model + role: quantitative_measure + - name: log2_selected_timecourses + dtype: float + description: >- + cleaned timecourses hard-thresholded based on single observations + passing noise model and impulse evaluation of biological feasibility + role: quantitative_measure + - name: log2_shrunken_timecourses + dtype: float + description: >- + selected timecourses with observation-level shrinkage based on + local FDR (false discovery rate). Most users of the data will want + to use this column. + role: quantitative_measure +--- + +# harbison_2004 +--- +license: mit +language: + - en +tags: + - genomics + - yeast + - transcription + - binding +pretty_name: "Harbison, 2004 ChIP-chip" +size_categories: + - 1M- + Environmental condition of the experiment. Nearly all of the 204 regulators + have a YPD condition, and some have others in addition. 
+ role: experimental_condition + definitions: + YPD: + description: Rich media baseline condition + # Harbison et al 2004: grown at 30°C (from HEAT condition context) + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: 1% yeast extract / 2% peptone / 2% glucose + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + SM: + description: Amino acid starvation stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.6 + od600: 0.6 + media: + # Harbison et al 2004: synthetic complete medium + name: synthetic_complete + carbon_source: unspecified + nitrogen_source: unspecified + chemical_treatment: + compound: sulfometuron_methyl + # Harbison et al 2004: 0.2 mg/ml + concentration_percent: 0.02 + duration_hours: 2 + RAPA: + description: Nutrient deprivation via TOR inhibition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: rapamycin + # Harbison et al 2004: 100 nM + concentration_percent: 9.142e-6 + duration_minutes: 20 + H2O2Hi: + description: High oxidative stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + 
concentration_percent: 2 + chemical_treatment: + compound: hydrogen_peroxide + # Harbison et al 2004: 4 mM + concentration_percent: 0.0136 + duration_minutes: 30 + H2O2Lo: + description: Moderate oxidative stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: hydrogen_peroxide + # Harbison et al 2004: 0.4 mM + concentration_percent: 0.00136 + duration_minutes: 20 + Acid: + description: Acidic pH stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: succinic_acid + # Harbison et al 2004: 0.05 M to reach pH 4.0 + concentration_percent: 0.59 + target_pH: 4.0 + duration_minutes: 30 + Alpha: + description: Mating pheromone induction condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: alpha_factor_pheromone + # Harbison et al 2004: 5 mg/ml + concentration_percent: 0.5 + duration_minutes: 30 + BUT14: + description: Long-term filamentation induction with butanol + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et 
al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YPD containing 1% butanol + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + additives: + - compound: butanol + concentration_percent: 1 + incubation_duration_hours: 14 + BUT90: + description: Short-term filamentation induction with butanol + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YPD containing 1% butanol + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + additives: + - compound: butanol + concentration_percent: 1 + incubation_duration_minutes: 90 + "Thi-": + description: Vitamin B1 deprivation stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: synthetic complete medium lacking thiamin + name: synthetic_complete_minus_thiamine + carbon_source: unspecified + nitrogen_source: unspecified + GAL: + description: Galactose-based growth medium condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YEP medium supplemented with galactose (2%) + name: yeast_extract_peptone + carbon_source: + - compound: D-galactose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: unspecified + - compound: peptone + concentration_percent: unspecified + HEAT: + description: Heat shock stress condition + # Harbison et al 2004: grown at 30°C, shifted to 37°C for 45 min + 
initial_temperature_celsius: 30 + temperature_shift_celsius: 37 + temperature_shift_duration_minutes: 45 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + # Harbison et al 2004: YPD + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + "Pi-": + description: Phosphate deprivation stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: synthetic complete medium lacking phosphate + name: synthetic_complete_minus_phosphate + carbon_source: unspecified + nitrogen_source: unspecified + RAFF: + description: Raffinose-based growth medium condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YEP medium supplemented with raffinose (2%) + name: yeast_extract_peptone + carbon_source: + - compound: D-raffinose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: unspecified + - compound: peptone + concentration_percent: unspecified + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the ChIPd transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the ChIPd transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene measured + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene measured + role: target_identifier + - name: effect + dtype: float64 + description: The chip 
channel ratio (effect size) + role: quantitative_measure + - name: pvalue + dtype: float64 + description: pvalue of the chip channel ratio (effect) + role: quantitative_measure +--- + +# hu_2007_reimand_2010 +--- +license: mit +language: + - en +tags: + - genomics + - yeast + - transcription + - perturbation + - response + - knockout + - TFKO +pretty_name: Hu 2007/Reimand 2010 TFKO +size_categories: + - 1M- + an old unique identifer, for use internally only. Deprecated and will be removed eventually. + Do not use in analysis. + - name: regulator_locus_tag + dtype: string + description: induced transcriptional regulator systematic ID. See hf/BrentLab/yeast_genome_resources + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: induced transcriptional regulator common name. If no common name exists, then the `regulator_locus_tag` is used. + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: The systematic ID of the feature to which the effect/pvalue is assigned. See hf/BrentLab/yeast_genome_resources + role: target_identifier + - name: target_symbol + dtype: string + description: The common name of the feature to which the effect/pvalue is assigned. If there is no common name, the `target_locus_tag` is used. + role: target_identifier + - name: effect + dtype: float + description: >- + log fold change of mutant vs wt. From the remaind methods: Differential expression + was calculated using a moderated eBayes t-test as implemented in the Limma + Bioconductor package + role: quantitative_measure + - name: pval + dtype: float + description: P-values were FDR-adjusted across the whole microarray dataset to correct for multiple testing + role: quantitative_measure + - name: average_od_of_replicates + dtype: float + description: average OD of the replicates at harvest + - name: heat_shock + dtype: bool + description: >- + `True` if the regulator strain was subjected to heat shock treatment. 
+ Applied to 22 transcription factors implicated in heat shock response. + `False` otherwise + role: experimental_condition + definitions: + true: + # Hu et al 2007: "15-min heat shock at 39°C" + temperature_celsius: 39 + duration_minutes: 15 + strain_background: + genotype: BY4741 + mating_type: MATa + markers: + - his3Δ1 + - leu2Δ0 + - met15Δ0 + - ura3Δ0 + source: Open_Biosystems + description: Knockout strains for nonessential transcription factors + false: + description: Standard growth conditions at 30°C + strain_background: + genotype: BY4741 + mating_type: MATa + markers: + - his3Δ1 + - leu2Δ0 + - met15Δ0 + - ura3Δ0 + source: Open_Biosystems + description: Knockout strains for nonessential transcription factors + - name: tetracycline_treatment + dtype: bool + description: >- + `True` if the regulator strain was treated with doxycycline to repress + TetO7-promoter regulated essential transcription factors. Applied to 6 + essential transcription factors. `False` for untreated control condition. 
+ role: experimental_condition + definitions: + true: + drug_treatment: + compound: doxycycline + # Hu et al 2007: 10 mg/ml + concentration_percent: 1 + duration_hours_min: 14 + duration_hours_max: 16 + strain_background: + genotype: BY4741_derivative + mating_type: MATa + markers: + - URA3::CMV-tTA + - his3Δ1 + - leu2Δ0 + - met15Δ0 + source: Open_Biosystems + description: Essential transcription factors with TetO7-promoter regulation + false: + description: No doxycycline treatment; TetO7 promoter active + strain_background: + genotype: BY4741_derivative + mating_type: MATa + markers: + - URA3::CMV-tTA + - his3Δ1 + - leu2Δ0 + - met15Δ0 + source: Open_Biosystems + description: Essential transcription factors with TetO7-promoter regulation +--- + +# hughes_2006 +--- +license: mit +language: +- en +tags: +- biology +- genomics +- yeast +- transcription-factors +- gene-expression +- perturbation-screen +- overexpression +- knockout +- microarray +- functional-genomics +pretty_name: "Hughes 2006 Yeast Transcription Factor Perturbation Dataset" +size_categories: +- 100K- + unique identifier for a specific sample. 
The sample ID identifies + a unique regulator_locus_tag and can be used to join to the + other datasets in this repo, including the metadata + - name: regulator_locus_tag + dtype: string + role: identifier + description: >- + Systematic gene name (ORF identifier) of the + transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: found_domain + dtype: string + description: >- + Identified DNA-binding domain(s) or protein family classification + - name: sgd_description + dtype: string + description: >- + Functional description from Saccharomyces Genome Database (SGD) + - name: essential + dtype: bool + description: >- + Boolean indicating whether the gene is essential for viability + - name: oe_passed_qc + dtype: bool + description: >- + Boolean indicating whether overexpression experiments passed + quality control + - name: del_passed_qc + dtype: bool + description: >- + Boolean indicating whether deletion experiments passed + quality control + +- config_name: overexpression + description: Overexpression perturbation normalized log2 fold changes + dataset_type: annotated_features + data_files: + - split: train + path: overexpression.parquet + # temperature and growth phase are unspecified. nitrogen_source is + # also unspecified + media: + # Hughes et al 2006: "selective medium supplemented with 2% raffinose" + name: selective_medium + carbon_source: + - compound: D-raffinose + # Hughes et al 2006: 2% raffinose + concentration_percent: 2 + induction: + # Hughes et al 2006: "induction with 2% galactose for 3 h" + inducer: + compound: D-galactose + concentration_percent: 2 + duration_hours: 3 + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample. 
The sample ID identifies + a unique regulator_locus_tag and can be used to join to the + other datasets in this repo, including the metadata + - name: regulator_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the + perturbed transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the perturbed transcription factor + - name: target_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the + target gene measured + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene measured + role: target_identifier + - name: dye_plus + dtype: float64 + role: quantitative_measure + description: >- + Normalized log2 fold change for positive (+) dye orientation. + Positive values indicate upregulation in response to overexpression. + - name: dye_minus + dtype: float64 + role: quantitative_measure + description: >- + Normalized log2 fold change for negative (-) dye orientation. + Positive values indicate upregulation in response to overexpression. + - name: mean_norm_log2fc + dtype: float64 + role: quantitative_measure + description: >- + Average log2 fold change across dye orientations, + providing a dye-independent estimate of gene expression + change upon transcription factor overexpression. 
+ +- config_name: knockout + description: Deletion/knockout perturbation normalized log2 fold changes + dataset_type: annotated_features + data_files: + - split: train + path: knockout.parquet + experimental_conditions: + temperature_celsius: unspecified + cultivation_method: unspecified + media: + # Hughes et al 2006: "synthetic medium supplemented with 2% dextrose" + name: synthetic_medium + carbon_source: + - compound: D-glucose + # Hughes et al 2006: 2% dextrose + concentration_percent: 2 + nitrogen_source: unspecified + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample. The sample ID identifies + a unique regulator_locus_tag and can be used to join to the + other datasets in this repo, including the metadata + - name: regulator_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the perturbed + transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the perturbed transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the + target gene measured + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene measured + role: target_identifier + - name: dye_plus + dtype: float64 + description: >- + Normalized log2 fold change for positive (+) dye orientation. + Positive values indicate upregulation in response to deletion. + role: quantitative_measure + - name: dye_minus + dtype: float64 + description: >- + Normalized log2 fold change for negative (-) dye orientation. + Positive values indicate upregulation in response to deletion. 
+ role: quantitative_measure
+ - name: mean_norm_log2fc
+ dtype: float64
+ description: >-
+ Average log2 fold change across dye orientations, providing a
+ dye-independent estimate of gene expression change upon
+ transcription factor deletion.
+ role: quantitative_measure
+---
+
+# kemmeren_2014
+---
+license: mit
+language:
+- en
+tags:
+- genomics
+- yeast
+- transcription
+- perturbation
+- response
+- knockout
+- TFKO
+pretty_name: "Kemmeren, 2014 Overexpression"
+size_categories:
+- 1M-
+ description: >-
+ Transcriptional regulator overexpression perturbation data with
+ differential expression measurements
+ dataset_type: annotated_features
+ default: true
+ metadata_fields: ["regulator_locus_tag", "regulator_symbol"]
+ data_files:
+ - split: train
+ path: kemmeren_2014.parquet
+ dataset_info:
+ features:
+ - name: sample_id
+ dtype: integer
+ description: >-
+ unique identifier for a specific sample.
+ The sample ID identifies a unique regulator.
+ - name: db_id
+ dtype: integer
+ description: >-
+ an old unique identifier, for use internally only. Deprecated and will be removed eventually.
+ Do not use in analysis. db_id = 0 for loci that were originally parsed incorrectly.
+ - name: regulator_locus_tag
+ dtype: string
+ description: >-
+ induced transcriptional regulator systematic ID.
+ See hf/BrentLab/yeast_genome_resources
+ role: regulator_identifier
+ - name: regulator_symbol
+ dtype: string
+ description: >-
+ induced transcriptional regulator common name.
+ If no common name exists, then the `regulator_locus_tag` is used.
+ role: regulator_identifier
+ - name: reporterId
+ dtype: string
+ description: probe ID as reported from the original data
+ - name: target_locus_tag
+ dtype: string
+ description: >-
+ The systematic ID of the feature to which the effect/pvalue is assigned.
+ See hf/BrentLab/yeast_genome_resources + role: target_identifier + - name: target_symbol + dtype: string + description: >- + The common name of the feature to which the effect/pvalue is assigned. + If there is no common name, the `target_locus_tag` is used. + role: target_identifier + - name: M + dtype: float64 + description: log₂ fold change (mutant vs wildtype) + role: quantitative_measure + - name: Madj + dtype: float64 + description: >- + M value with the cell cycle signal removed + (see paper cited in the introduction above) + role: quantitative_measure + - name: A + dtype: float64 + description: >- + average log2 intensity of the two channels, a proxy for expression level + (This is a guess based on microarray convention -- not specified on holstege site) + role: quantitative_measure + - name: pval + dtype: float64 + description: significance of the modeled effect (M), from limma + role: quantitative_measure + - name: variable_in_wt + dtype: string + description: >- + True if the given locus is variable in the WT condition. + Recommended to remove these from analysis. False otherwise. + See Holstege website for more information + role: experimental_condition + - name: multiple_probes + dtype: string + description: >- + True if there is more than one probe associated with + the same genomic locus. False otherwise + role: experimental_condition + - name: kemmeren_regulator + dtype: string + description: >- + True if the regulator is one of the regulators studied in the + original Kemmeren et al. (2014) global regulator study. 
False otherwise + role: experimental_condition + - name: regulator_desc + dtype: string + description: >- + functional description of the induced regulator + from the original paper supplement + role: experimental_condition + - name: functional_category + dtype: string + description: functional classification of the regulator from the original paper supplement + role: experimental_condition + - name: slides + dtype: string + description: identifier(s) for the microarray slide(s) used in this experiment + role: experimental_condition + - name: mating_type + dtype: string + description: mating type of the strain background used in the experiment + role: experimental_condition + - name: source_of_deletion_mutants + dtype: string + description: origin of the strain + role: experimental_condition + - name: primary_hybsets + dtype: string + description: identifier for the primary hybridization set to which this sample belongs + role: experimental_condition + - name: responsive_non_responsive + dtype: string + description: >- + classification of the regulator as responsive or not to the + deletion from the original paper supplement + role: experimental_condition + - name: nr_sign_changes + dtype: integer + description: >- + number of significant changes in expression detected for the regulator locus tag (abs(M) > log2(1.7) & pval < 0.05). + Note that there is a slight difference when calculating from the data provided here, I believe due to a difference in + the way the targets are parsed and filtered (some ORFs that have since been removed from the annotations are removed). + I didn't investigate this closely, though. 
+ role: experimental_condition + - name: profile_first_published + dtype: string + description: citation or reference indicating where this expression profile was first published + role: experimental_condition + - name: chase_notes + dtype: string + description: notes added during data curation and parsing +--- + +# mahendrawada_2025 +--- +license: mit +language: +- en +tags: +- biology +- genomics +- yeast +- transcription-factors +- gene-expression +- binding +- chec +- perturbation +- rnaseq +- nascent rnaseq +pretty_name: "Mahendrawada 2025 ChEC-seq and Nascent RNA-seq data" +size_categories: +- 100K- + unique identifier for a specific sample, which uniquely identifies one of the 178 TFs. + Across datasets in this repo, the a given sample_id identifies the same regulator. + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene + - name: peak_score + dtype: float64 + description: ChEC signal around peak center (sum of ChEC signal from -150 to +150 bp from peak summit) normalized to Drosophila spike-in control + - name: processing_method + dtype: string + description: Method used for peak calling and quantification (original authors) + +- config_name: reprocessed_chec_seq + description: ChEC-seq transcription factor binding data reprocessed with updated peak calling methodology + dataset_type: annotated_features + data_files: + - split: train + path: chec_reprocessed_mahendrawada_2025.parquet + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample, which uniquely identifies one of the 178 TFs. 
+ Across datasets in this repo, a given sample_id identifies the same regulator.
+ - name: regulator_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the transcription factor
+ - name: regulator_symbol
+ dtype: string
+ description: Standard gene symbol of the transcription factor
+ - name: target_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the target gene
+ - name: target_symbol
+ dtype: string
+ description: Standard gene symbol of the target gene
+ - name: enrichment
+ dtype: float64
+ description: ratio of experimental insertions to background insertions
+ - name: poisson_pval
+ dtype: float64
+ description: enrichment poisson pvalue
+
+- config_name: reprocessed_diffcontrol_5prime
+ description: Comparing two different sets of control replicates, m2025 from the Mahendrawada 2025 paper, and h2021 from a previous paper from the Hahn lab
+ dataset_type: annotated_features
+ metadata_fields:
+ - control_source
+ - condition
+ - regulator_locus_tag
+ experimental_conditions:
+ # Mahendrawada et al 2025: "30 °C culture"
+ temperature_celsius: 30
+ cultivation_method: unspecified
+ growth_phase_at_harvest:
+ # Mahendrawada et al 2025: "A600 of ~1.0"
+ od600: 1.0
+ media:
+ # Mahendrawada et al 2025: "synthetic complete (SC) media"
+ name: synthetic_complete
+ carbon_source: unspecified
+ nitrogen_source:
+ - compound: yeast_nitrogen_base
+ # Mahendrawada et al 2025: 1.7 g/L (without ammonium sulfate or amino acids (BD Difco))
+ concentration_percent: 0.17
+ specifications:
+ - without_ammonium_sulfate
+ - without_amino_acids
+ - compound: ammonium_sulfate
+ # Mahendrawada et al 2025: 5 g/L
+ concentration_percent: 0.5
+ - compound: amino_acid_dropout_mix
+ # Mahendrawada et al 2025: 0.6 g/L
+ concentration_percent: 0.06
+ - compound: adenine_sulfate
+ # Mahendrawada et al 2025: 40 μg/ml = 0.04 g/L
+ concentration_percent: 0.004
+ - compound: uracil
+ # Mahendrawada et al 2025: 2 μg/ml = 0.002
g/L
+ concentration_percent: 0.0002
+ data_files:
+ - split: train
+ path: reprocess_diffcontrol_5prime.parquet
+ dataset_info:
+ features:
+ - name: control_source
+ dtype: string
+ description: Source identifier for the control dataset (m2025 or h2021)
+ - name: condition
+ dtype: string
+ description: Experimental condition. 'standard' is YPD.
+ - name: regulator_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the transcription factor
+ - name: target_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the target gene
+ - name: chr
+ dtype: string
+ description: Chromosome name of the promoter/target region
+ - name: start
+ dtype: int64
+ description: Start coordinate of the promoter region
+ - name: end
+ dtype: int64
+ description: End coordinate of the promoter region
+ - name: strand
+ dtype: string
+ description: Strand orientation (+ or -) of the promoter/target
+ - name: input_vs_target_log2_fold_change
+ dtype: float64
+ description: Log2 fold change of TF-tagged sample vs control (from DESeq2)
+ - name: input_vs_target_p_value
+ dtype: float64
+ description: P-value for differential enrichment (from DESeq2)
+ - name: input_vs_target_adj_p_value
+ dtype: float64
+ description: Adjusted p-value (FDR-corrected) for differential enrichment (from DESeq2)
+
+- config_name: rna_seq
+ description: Nascent RNA-seq differential expression data following transcription factor depletion using 4TU metabolic labeling
+ dataset_type: annotated_features
+ metadata_fields:
+ - regulator_locus_tag
+ - regulator_symbol
+ data_files:
+ - split: train
+ path: rnaseq_mahendrawada_2025.parquet
+ dataset_info:
+ features:
+ - name: sample_id
+ dtype: integer
+ description: >-
+ unique identifier for a specific sample, which uniquely identifies one of the 178 TFs.
+ Across datasets in this repo, a given sample_id identifies the same regulator.
+ - name: db_id
+ dtype: integer
+ description: >-
+ an old unique identifier, for use internally only. Deprecated and will be removed eventually.
+ Do not use in analysis.
+ - name: regulator_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the depleted transcription factor
+ - name: regulator_symbol
+ dtype: string
+ description: Standard gene symbol of the depleted transcription factor
+ - name: target_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the differentially expressed target gene
+ - name: target_symbol
+ dtype: string
+ description: Standard gene symbol of the differentially expressed target gene
+ - name: log2fc
+ dtype: float64
+ description: Log2 fold change (IAA/DMSO) for significantly affected genes (DESeq2, padj <0.1, FC >= 1.3)
+---
+
+# rossi_2021
+---
+license: mit
+tags:
+- transcription-factor
+- binding
+- chipexo
+- genomics
+- biology
+language:
+- en
+pretty_name: Rossi ChIP-exo 2021
+experimental_conditions:
+ temperature_celsius: 25
+ cultivation_method: unspecified
+ growth_phase_at_harvest:
+ phase: mid_log
+ od600: 0.8
+ media:
+ name: yeast_peptone_dextrose
+ carbon_source:
+ - compound: D-glucose
+ concentration_percent: unspecified
+ nitrogen_source:
+ - compound: yeast_extract
+ concentration_percent: unspecified
+ - compound: peptone
+ concentration_percent: unspecified
+
+ # Heat shock applied only to SAGA strains
+ # note that I'm not sure which strains this
+ # applies to -- it is a TODO to better
+ # document this
+ heat_shock:
+ induced: true
+ temperature_celsius: 37
+ duration_minutes: 6
+ pre_induction_temperature_celsius: 25
+ method: equal_volume_medium_transfer
+configs:
+- config_name: metadata
+ description: Metadata describing the tagged regulator in each experiment
+ dataset_type: metadata
+ data_files:
+ - split: train
+ path: rossi_2021_metadata.parquet
+ dataset_info:
+ features:
+ - name: regulator_locus_tag
+ dtype: string
+ description:
Systematic gene name (ORF identifier) of the transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: run_accession + dtype: string + description: GEO run accession identifier for the sample + - name: yeastepigenome_id + dtype: string + description: Sample identifier used by yeastepigenome.org +- config_name: genome_map + description: "ChIP-exo 5' tag coverage data partitioned by sample accession" + dataset_type: genome_map + data_files: + - split: train + path: genome_map/*/*.parquet + dataset_info: + features: + - name: chr + dtype: string + description: Chromosome name (e.g., chrI, chrII, etc.) + - name: pos + dtype: int32 + description: "Genomic position of the 5' tag" + - name: pileup + dtype: int32 + description: "Depth of coverage (number of 5' tags) at this genomic position" +- config_name: rossi_annotated_features + description: ChIP-exo regulator-target binding features with peak statistics + dataset_type: annotated_features + default: true + metadata_fields: + - regulator_locus_tag + - regulator_symbol + - target_locus_tag + data_files: + - split: train + path: yeastepigenome_annotatedfeatures.parquet + dataset_info: + features: + - name: sample_id + dtype: int32 + description: >- + Unique identifier for each ChIP-exo experimental sample. + - name: pss_id + dtype: float64 + description: >- + Current brentlab promotersetsig table id. This will eventually be removed. + - name: binding_id + dtype: float64 + description: >- + Current brentlab binding table id. This will eventually be removed. + - name: yeastepigenome_id + dtype: float64 + description: >- + Unique identifier in the yeastepigenome database. + - name: regulator_locus_tag + dtype: string + description: >- + Systematic ORF name of the regulator. + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: >- + Common gene name of the regulator. 
+ role: regulator_identifier
+ - name: target_locus_tag
+ dtype: string
+ description: >-
+ The systematic ID of the feature to which the effect/pvalue is
+ assigned. See hf/BrentLab/yeast_genome_resources
+ role: target_identifier
+ - name: target_symbol
+ dtype: string
+ description: >-
+ The common name of the feature to which the effect/pvalue is
+ assigned. If there is no common name, the `target_locus_tag` is
+ used.
+ role: target_identifier
+ - name: n_sig_peaks
+ dtype: float64
+ description: >-
+ Number of peaks in the promoter region of the target gene
+ role: quantitative_measure
+ - name: max_fc
+ dtype: float64
+ description: >-
+ If there are multiple peaks in the promoter region, then the maximum is
+ reported. Otherwise, it is the fold change of the single peak in the
+ promoter.
+ role: quantitative_measure
+ - name: min_pval
+ dtype: float64
+ description: >-
+ The most significant p-value among peaks for this interaction.
+ role: quantitative_measure
+- config_name: reprocess_annotatedfeatures
+ description: >-
+ Annotated features reprocessed with updated peak
+ calling methodology
+ dataset_type: annotated_features
+ data_files:
+ - split: train
+ path: reprocess_annotatedfeatures.parquet
+ dataset_info:
+ features:
+ - name: regulator_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the transcription factor
+ - name: regulator_symbol
+ dtype: string
+ description: Standard gene symbol of the transcription factor
+ - name: target_locus_tag
+ dtype: string
+ description: Systematic gene name (ORF identifier) of the target gene
+ - name: target_symbol
+ dtype: string
+ description: Standard gene symbol of the target gene
+ - name: baseMean
+ dtype: float64
+ description: Average of normalized count values, dividing by size factors, taken over all samples
+ - name: log2FoldChange
+ dtype: float64
+ description: Log2 fold change between comparison and control groups
+ - name: lfcSE
+ dtype: float64
+
description: Standard error estimate for the log2 fold change estimate + - name: stat + dtype: float64 + description: Value of the test statistic for the gene + - name: pvalue + dtype: float64 + description: P-value of the test for the gene + - name: padj + dtype: float64 + description: Adjusted p-value for multiple testing for the gene +- config_name: reprocess_annotatedfeatures_tagcounts + description: Another version of the reprocessed data, quantified similarly to Calling Cards + dataset_type: annotated_features + data_files: + - split: train + path: reprocess_annotatedfeatures_tagcounts.parquet + dataset_info: + features: + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene + role: target_identifier + - name: rank + dtype: int64 + description: Rank (ties method min rank) of the peak based on pvalue with ties broken by enrichment. Largest rank is most significant. + - name: control_count + dtype: int64 + description: Number of tags in the control condition + - name: experimental_count + dtype: int64 + description: Number of tags in the experimental condition + - name: mu + dtype: float64 + description: Expected count under the null hypothesis (control_count + 1) * (experimental_total_tags / control_total_tags) + - name: enrichment + dtype: float64 + description: Enrichment ratio of experimental over control. 
(experimental_counts / experimental_total) / (control_counts + pseudocount) / control_total + role: quantitative_measure + - name: log2_enrichment + dtype: float64 + description: Log2-transformed enrichment ratio + role: quantitative_measure + - name: neg_log10_pvalue + dtype: float64 + description: Negative log10 of the p-value for binding significance + role: quantitative_measure + - name: neg_log10_qvalue + dtype: float64 + description: Negative log10 of the FDR-adjusted q-value + role: quantitative_measure +--- + +# yeast_genome_resources +--- +license: mit +pretty_name: BrentLab Yeast Genome Resources +language: + - en +dataset_info: + features: + - name: start + dtype: int32 + description: Start coordinate (1-based, **inclusive**) + - name: end + dtype: int32 + description: End coordinate (1-based, **inclusive**) + - name: strand + dtype: string + levels: + - + + - "-" + description: Strand of feature + - name: type + dtype: string + levels: + - gene + - ncRNA_gene + - tRNA_gene + - snoRNA_gene + - transposable_element_gene + - pseudogene + - telomerase_RNA_gene + - snRNA_gene + - rRNA_gene + - blocked_reading_frame + description: classification of feature + - name: locus_tag + dtype: string + description: Systematic ID of feature + - name: symbol + dtype: string + description: Common name of feature + - name: alias + dtype: string + description: Alternative names of feature, typically alternative symbols + - name: source + dtype: string + description: Annotation file version/origin of the feature + - name: note + dtype: string + description: Additional feature information, typically the description from the + SGD gff/gtf + partitioning: + keys: + - name: chr + dtype: string + levels: + - chrI + - chrII + - chrVII + - chrV + - chrIII + - chrIV + - chrVIII + - chrVI + - chrX + - chrIX + - chrXI + - chrXIV + - chrXII + - chrXIII + - chrXV + - chrXVI + - chrM +configs: + - config_name: features + default: true + data_files: + - split: train + path: + - 
features/*/part-0.parquet +--- diff --git a/tfbpapi/tests/snapshots/__init__.py b/tfbpapi/tests/snapshots/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tfbpapi/tests/snapshots/promotersetsig_records_and_files.tar.gz b/tfbpapi/tests/snapshots/promotersetsig_records_and_files.tar.gz deleted file mode 100644 index bde8021..0000000 Binary files a/tfbpapi/tests/snapshots/promotersetsig_records_and_files.tar.gz and /dev/null differ diff --git a/tfbpapi/tests/snapshots/snap_test_AbstractAPI.py b/tfbpapi/tests/snapshots/snap_test_AbstractAPI.py deleted file mode 100644 index 8444992..0000000 --- a/tfbpapi/tests/snapshots/snap_test_AbstractAPI.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# snapshottest: v1 - https://goo.gl/zC4yUc -from __future__ import unicode_literals - -from snapshottest import Snapshot - - -snapshots = Snapshot() - -snapshots['test_cache_operations cache_get_after_delete'] = 'None' - -snapshots['test_cache_operations cache_get_after_set'] = 'test_value' - -snapshots['test_cache_operations cache_list'] = "['test_key']" - -snapshots['test_pop_params pop_params_after_all_removed'] = '{}' - -snapshots['test_pop_params pop_params_after_one_removed'] = '{"param2": "value2"}' - -snapshots['test_push_params push_params'] = '{"param1": "value1", "param2": "value2"}' diff --git a/tfbpapi/tests/snapshots/snap_test_AbstractRecordsAndFilesAPI.py b/tfbpapi/tests/snapshots/snap_test_AbstractRecordsAndFilesAPI.py deleted file mode 100644 index 807cb7d..0000000 --- a/tfbpapi/tests/snapshots/snap_test_AbstractRecordsAndFilesAPI.py +++ /dev/null @@ -1,15 +0,0 @@ -# snapshottest: v1 - https://goo.gl/zC4yUc - -from snapshottest import Snapshot - -snapshots = Snapshot() - -snapshots[ - "test_save_response_records_and_files 1" -] = """id,uploader_id,upload_date,modifier_id,modified_date,binding_id,promoter_id,background_id,fileformat_id,file -10690,1,2024-03-26,1,2024-03-26 14:28:43.825628+00:00,4079,4,6,5,promotersetsig/10690.csv.gz 
-10694,1,2024-03-26,1,2024-03-26 14:28:44.739775+00:00,4083,4,6,5,promotersetsig/10694.csv.gz -10754,1,2024-03-26,1,2024-03-26 14:29:01.837335+00:00,4143,4,6,5,promotersetsig/10754.csv.gz -10929,1,2024-03-26,1,2024-03-26 14:29:45.379790+00:00,4318,4,6,5,promotersetsig/10929.csv.gz -10939,1,2024-03-26,1,2024-03-26 14:29:47.853980+00:00,4327,4,6,5,promotersetsig/10939.csv.gz -""" diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_delete b/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_delete deleted file mode 100644 index 4af1832..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_delete +++ /dev/null @@ -1 +0,0 @@ -None \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_set b/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_set deleted file mode 100644 index fff1c65..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_set +++ /dev/null @@ -1 +0,0 @@ -test_value \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_list b/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_list deleted file mode 100644 index 1950491..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_list +++ /dev/null @@ -1 +0,0 @@ -['test_key'] \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_all_removed b/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_all_removed deleted file mode 100644 index 9e26dfe..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_all_removed +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git 
a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_one_removed b/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_one_removed deleted file mode 100644 index cab5c0c..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_one_removed +++ /dev/null @@ -1 +0,0 @@ -{"param2": "value2"} \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_push_params/push_params b/tfbpapi/tests/snapshots/test_AbstractAPI/test_push_params/push_params deleted file mode 100644 index 21d59b6..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_push_params/push_params +++ /dev/null @@ -1 +0,0 @@ -{"param1": "value1", "param2": "value2"} \ No newline at end of file diff --git a/tfbpapi/tests/test_AbstractAPI.py b/tfbpapi/tests/test_AbstractAPI.py deleted file mode 100644 index 84a643d..0000000 --- a/tfbpapi/tests/test_AbstractAPI.py +++ /dev/null @@ -1,94 +0,0 @@ -import json -from typing import Any - -import pytest -import responses - -from tfbpapi.AbstractAPI import AbstractAPI -from tfbpapi.ParamsDict import ParamsDict - - -class ConcreteAPI(AbstractAPI): - """Concrete implementation of AbstractAPI for testing purposes.""" - - def create(self, data: dict[str, Any], **kwargs) -> Any: - pass # Implement for testing if necessary - - def read(self, **kwargs) -> dict[str, Any]: - return {"id": id} # Mock implementation for testing - - def update(self, df: Any, **kwargs) -> Any: - pass # Implement for testing if necessary - - def delete(self, id: str, **kwargs) -> Any: - pass # Implement for testing if necessary - - def submit(self, post_dict: dict, **kwargs) -> Any: - pass # Implement for testing if necessary - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - pass # Implement for testing if necessary - - -@pytest.fixture -@responses.activate -def api_client(): - valid_url = "https://valid.url" - 
responses.add(responses.HEAD, valid_url, status=200) - return ConcreteAPI(url=valid_url, token="token") - - -def test_initialize(snapshot, api_client): - assert api_client.url == "https://valid.url" - assert api_client.token == "token" - assert isinstance(api_client.params, ParamsDict) - - -def test_push_params(snapshot, api_client): - params = {"param1": "value1", "param2": "value2"} - api_client.push_params(params) - # Serialize the dictionary to a JSON string for comparison - params_as_json = json.dumps(api_client.params.as_dict(), sort_keys=True) - snapshot.assert_match(params_as_json, "push_params") - - -def test_pop_params(snapshot, api_client): - params = {"param1": "value1", "param2": "value2"} - api_client.push_params(params) - api_client.pop_params(["param1"]) - params_as_json1 = json.dumps(api_client.params.as_dict(), sort_keys=True) - snapshot.assert_match(params_as_json1, "pop_params_after_one_removed") - api_client.pop_params() - params_as_json2 = json.dumps(api_client.params.as_dict(), sort_keys=True) - snapshot.assert_match(params_as_json2, "pop_params_after_all_removed") - - -@responses.activate -def test_is_valid_url(api_client): - invalid_url = "https://invalid.url" - - responses.add(responses.HEAD, invalid_url, status=404) - - with pytest.raises(ValueError): - api_client.url = invalid_url - - -def test_cache_operations(snapshot, api_client): - key = "test_key" - value = "test_value" - - api_client._cache_set(key, value) - snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_set") - - keys = api_client._cache_list() - snapshot.assert_match(str(keys), "cache_list") - - api_client._cache_delete(key) - snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_delete") - snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_delete") - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_AbstractRecordsAndFilesAPI.py b/tfbpapi/tests/test_AbstractRecordsAndFilesAPI.py deleted 
file mode 100644 index 1c64a39..0000000 --- a/tfbpapi/tests/test_AbstractRecordsAndFilesAPI.py +++ /dev/null @@ -1,284 +0,0 @@ -import gzip -from io import BytesIO -from tempfile import NamedTemporaryFile -from typing import Any - -import pandas as pd -import pytest -import responses -from aioresponses import aioresponses - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - -# The following test is commented out because it requires a running server -- this is -# how I retrieved the data for the tests below. The data is saved in the snapshot -# directory -# -# @pytest.mark.asyncio -# async def test_save_response_records_and_files(snapshot): -# async with aiohttp.ClientSession() as session: -# url = "http://127.0.0.1:8001/api/promotersetsig/export" -# async with session.get( -# url, -# headers={ -# "Authorization": f"token {os.getenv('TOKEN')}", -# "Content-Type": "application/json", -# }, -# params={ -# "regulator_symbol": "HAP5", -# "workflow": "nf_core_callingcards_dev", -# "data_usable": "pass", -# }, -# ) as response: -# response.raise_for_status() -# response_text = await response.text() -# snapshot.assert_match(response_text) -# assert response.status == 200 - - -# @pytest.mark.asyncio -# async def test_save_response_records_and_files(): -# async with aiohttp.ClientSession() as session: -# url = "http://127.0.0.1:8001/api/promotersetsig/record_table_and_files" -# async with session.get( -# url, -# headers={ -# "Authorization": f"token {os.getenv('TOKEN')}", -# "Content-Type": "application/gzip", -# }, -# params={ -# "regulator_symbol": "HAP5", -# "workflow": "nf_core_callingcards_dev", -# "data_usable": "pass", -# }, -# ) as response: -# response.raise_for_status() -# response_content = await response.read() -# with open("saved_response.tar.gz", "wb") as f: -# f.write(response_content) -# assert response.status == 200 - - -def promotersetsig_csv_gzip() -> bytes: - # Define the data as a dictionary - data = { - "id": [10690, 
10694, 10754, 10929, 10939], - "uploader_id": [1, 1, 1, 1, 1], - "upload_date": ["2024-03-26"] * 5, - "modifier_id": [1, 1, 1, 1, 1], - "modified_date": [ - "2024-03-26 14:28:43.825628+00:00", - "2024-03-26 14:28:44.739775+00:00", - "2024-03-26 14:29:01.837335+00:00", - "2024-03-26 14:29:45.379790+00:00", - "2024-03-26 14:29:47.853980+00:00", - ], - "binding_id": [4079, 4083, 4143, 4318, 4327], - "promoter_id": [4, 4, 4, 4, 4], - "background_id": [6, 6, 6, 6, 6], - "fileformat_id": [5, 5, 5, 5, 5], - "file": [ - "promotersetsig/10690.csv.gz", - "promotersetsig/10694.csv.gz", - "promotersetsig/10754.csv.gz", - "promotersetsig/10929.csv.gz", - "promotersetsig/10939.csv.gz", - ], - } - - # Create a DataFrame - df = pd.DataFrame(data) - - # Convert the DataFrame to CSV and compress it using gzip - csv_buffer = BytesIO() - with gzip.GzipFile(fileobj=csv_buffer, mode="w") as gz: - df.to_csv(gz, index=False) - - # Get the gzipped data as bytes - return csv_buffer.getvalue() - - -class ConcreteRecordsAndFilesAPI(AbstractRecordsAndFilesAPI): - """Concrete implementation of AbstractRecordsAndFilesAPI for testing purposes.""" - - def create(self, data: dict[str, Any], **kwargs) -> Any: - pass - - def update(self, df: Any, **kwargs) -> Any: - pass - - def delete(self, id: str, **kwargs) -> Any: - pass - - def submit(self, post_dict: dict, **kwargs) -> Any: - pass # Implement for testing if necessary - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - pass # Implement for testing if necessary - - -@pytest.fixture -@responses.activate -def api_client(): - valid_url = "http://127.0.0.1:8001/api/promotersetsig" - responses.add(responses.HEAD, valid_url, status=200) - return ConcreteRecordsAndFilesAPI(url=valid_url, token="my_token") - - -@pytest.mark.asyncio -async def test_read_without_files(snapshot, api_client): - with aioresponses() as m: - # Mock the HTTP response with the saved snapshot response - m.get( - 
"http://127.0.0.1:8001/api/promotersetsig/export", - status=200, - body=promotersetsig_csv_gzip(), - headers={"Content-Type": "application/gzip"}, - ) - - result = await api_client.read() - assert isinstance(result.get("metadata"), pd.DataFrame) - assert result.get("metadata").shape == ( - 5, - 10, - ) - - -# chatGPT and I went through many iterations of trying to mock two endpoints at once. -# no success. the retrieve_files is untested outside of the tutorial notebook as a -# result -# -# @pytest.mark.asyncio -# async def test_read_with_responses(snapshot, api_client): -# with responses.RequestsMock() as rsps: -# # Mock the /export endpoint -# rsps.add( -# responses.GET, -# "http://127.0.0.1:8001/api/promotersetsig/export", -# body=promotersetsig_csv_gzip(), -# status=200, -# content_type="text/csv", -# ) - -# # Path to the tar.gz file -# tar_gz_file_path = os.path.join( -# os.path.dirname(__file__), -# "snapshots", -# "promotersetsig_records_and_files.tar.gz", -# ) - -# # Read the content of the tar.gz file -# with open(tar_gz_file_path, "rb") as tar_gz_file: -# tar_gz_content = tar_gz_file.read() - -# # Mock the /record_table_and_files endpoint -# rsps.add( -# responses.GET, -# "http://127.0.0.1:8001/api/promotersetsig/record_table_and_files", -# body=tar_gz_content, -# status=200, -# content_type="application/gzip", -# ) - -# # Helper function to create a mock ClientResponse -# async def create_mock_response(url, method, body, content_type, status): -# return MockClientResponse( -# method, URL(url), status, {"Content-Type": content_type}, body -# ) - -# # Patch aiohttp.ClientSession.get to use our mocked responses -# async def mock_get(self, url, **kwargs): -# if "export" in url: -# return await create_mock_response( -# url, -# "GET", -# promotersetsig_csv_gzip().encode(), -# "text/csv", -# 200, -# ) -# elif "record_table_and_files" in url: -# return await create_mock_response( -# url, -# "GET", -# tar_gz_content, -# "application/gzip", -# 200, -# ) -# else: -# 
raise ValueError("Unexpected URL") - -# with patch("aiohttp.ClientSession.get", new=mock_get): -# # Test the read method without retrieving files -# result = await api_client.read() -# assert isinstance(result.get("metadata"), pd.DataFrame) -# assert result.get("metadata").shape == (5, 10) - -# # Test the read method with retrieving files -# result = await api_client.read(retrieve_files=True) -# assert isinstance(result.get("metadata"), pd.DataFrame) -# assert result.get("metadata").shape == (5, 10) -# assert isinstance(result.get("data"), dict) -# assert len(result.get("data")) == 5 -# assert all(isinstance(v, pd.DataFrame) \ -# for v in result.get("data").values()) - -# test the _detect_delimiter method #### - - -def test_detect_delimiter_errors(api_client): - # test that a FileNotFound error is raised if the file does not exist - with pytest.raises(FileNotFoundError): - api_client._detect_delimiter("non_existent_file.csv") - - with NamedTemporaryFile(mode="w", suffix=".csv.gz") as tmpfile: - tmpfile.write("col1,col2,col3\nval1,val2,val3") - tmpfile.flush() - tmpfile_path = tmpfile.name - - with pytest.raises(gzip.BadGzipFile): - api_client._detect_delimiter(tmpfile_path) - - -def test_comma_delimiter(api_client): - with NamedTemporaryFile(mode="w", suffix=".csv") as tmpfile: - tmpfile.write("col1,col2,col3\nval1,val2,val3") - tmpfile.flush() - tmpfile_path = tmpfile.name - - delimiter = api_client._detect_delimiter(tmpfile_path) - assert delimiter == "," - - -def test_tab_delimiter(api_client): - with NamedTemporaryFile(mode="w", suffix=".csv") as tmpfile: - tmpfile.write("col1\tcol2\tcol3\nval1\tval2\tval3") - tmpfile.flush() - tmpfile_path = tmpfile.name - - delimiter = api_client._detect_delimiter(tmpfile_path) - assert delimiter == "\t" - - -def test_space_delimiter(api_client): - with NamedTemporaryFile(mode="w", suffix=".csv") as tmpfile: - tmpfile.write("col1 col2 col3\nval1 val2 val3") - tmpfile.flush() - tmpfile_path = tmpfile.name - - delimiter = 
api_client._detect_delimiter(tmpfile_path) - assert delimiter == " " - - -def test_gzipped_file(api_client): - with NamedTemporaryFile(suffix=".csv.gz") as tmpfile: - with gzip.open(tmpfile.name, "wt") as gzfile: - gzfile.write("col1,col2,col3\nval1,val2,val3") - gzfile.flush() - tmpfile_path = tmpfile.name - - delimiter = api_client._detect_delimiter(tmpfile_path) - assert delimiter == "," diff --git a/tfbpapi/tests/test_AbstractRecordsOnlyAPI.py b/tfbpapi/tests/test_AbstractRecordsOnlyAPI.py deleted file mode 100644 index 1def39a..0000000 --- a/tfbpapi/tests/test_AbstractRecordsOnlyAPI.py +++ /dev/null @@ -1,71 +0,0 @@ -import gzip -from typing import Any - -import pandas as pd -import pytest -import responses -from aioresponses import aioresponses - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class ConcreteAPI(AbstractRecordsOnlyAPI): - """Concrete implementation of AbstractRecordsOnlyAPI for testing purposes.""" - - def create(self, data: dict[str, Any], **kwargs) -> Any: - pass # Implement for testing if necessary - - def update(self, df: Any, **kwargs) -> Any: - pass # Implement for testing if necessary - - def delete(self, id: str, **kwargs) -> Any: - pass # Implement for testing if necessary - - def submit(self, post_dict: dict, **kwargs) -> Any: - pass # Implement for testing if necessary - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - pass # Implement for testing if necessary - - -@pytest.fixture -@responses.activate -def api_client(): - valid_url = "https://example.com/api/endpoint" - responses.add(responses.HEAD, valid_url, status=200) - return ConcreteAPI(url=valid_url, token="my_token") - - -@pytest.mark.asyncio -async def test_read(snapshot, api_client): - with aioresponses() as m: - # Mocking the response - mocked_csv = ( - "id,uploader_id,upload_date,modifier_id,modified_date,binding_id,promoter_id,background_id,fileformat_id,file\n" # noqa: E501 - 
"10690,1,2024-03-26,1,2024-03-26 14:28:43.825628+00:00,4079,4,6,5,promotersetsig/10690.csv.gz\n" # noqa: E501 - "10694,1,2024-03-26,1,2024-03-26 14:28:44.739775+00:00,4083,4,6,5,promotersetsig/10694.csv.gz\n" # noqa: E501 - "10754,1,2024-03-26,1,2024-03-26 14:29:01.837335+00:00,4143,4,6,5,promotersetsig/10754.csv.gz\n" # noqa: E501 - "10929,1,2024-03-26,1,2024-03-26 14:29:45.379790+00:00,4318,4,6,5,promotersetsig/10929.csv.gz\n" # noqa: E501 - "10939,1,2024-03-26,1,2024-03-26 14:29:47.853980+00:00,4327,4,6,5,promotersetsig/10939.csv.gz" # noqa: E501 - ) - - # Convert to bytes and gzip the content - gzipped_csv = gzip.compress(mocked_csv.encode("utf-8")) - - m.get( - "https://example.com/api/endpoint/export", - status=200, - body=gzipped_csv, - headers={"Content-Type": "application/gzip"}, - ) - - result = await api_client.read() - assert isinstance(result, dict) - assert isinstance(result.get("metadata"), pd.DataFrame) - assert result.get("metadata").shape == (5, 10) # type: ignore - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_Cache.py b/tfbpapi/tests/test_Cache.py deleted file mode 100644 index a84eb37..0000000 --- a/tfbpapi/tests/test_Cache.py +++ /dev/null @@ -1,66 +0,0 @@ -import time - -import pytest - -from tfbpapi.Cache import Cache - - -def test_cache_set_and_get(): - cache = Cache() - cache.set("key1", "value1") - assert cache.get("key1") == "value1" - assert cache.get("key2", "default_value") == "default_value" - - -def test_cache_list(): - cache = Cache() - cache.set("key1", "value1") - cache.set("key2", "value2") - keys = cache.list() - assert "key1" in keys - assert "key2" in keys - - -def test_cache_delete(): - cache = Cache() - cache.set("key1", "value1") - cache.set("key2", "value2") - cache.delete("key1") - assert cache.get("key1") is None - assert cache.get("key2") == "value2" - - -def test_cache_ttl(): - cache = Cache(ttl=1) # TTL set to 1 second - cache.set("key1", "value1") - time.sleep(1.5) # Wait for TTL to 
expire - assert cache.get("key1") is None # Should be None after TTL expiry - - -def test_cache_lru(): - cache = Cache(maxsize=2) - cache.set("key1", "value1") - cache.set("key2", "value2") - cache.set("key3", "value3") # This should evict "key1" if LRU works - assert cache.get("key1") is None - assert cache.get("key2") == "value2" - assert cache.get("key3") == "value3" - - -def test_separate_cache_instances(): - cache1 = Cache() - cache2 = Cache() - - cache1.set("key1", "value1") - cache2.set("key2", "value2") - - # Ensure they don't share state - assert cache1.get("key1") == "value1" - assert cache1.get("key2") is None - - assert cache2.get("key2") == "value2" - assert cache2.get("key1") is None - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_ParamsDict.py b/tfbpapi/tests/test_ParamsDict.py deleted file mode 100644 index ee5a246..0000000 --- a/tfbpapi/tests/test_ParamsDict.py +++ /dev/null @@ -1,96 +0,0 @@ -import pytest -import requests # type: ignore -import responses - -from tfbpapi.ParamsDict import ParamsDict - - -def test_initialization(): - params = ParamsDict({"b": 2, "a": 1}, valid_keys=["a", "b"]) - assert params == {"a": 1, "b": 2} - - -def test_getitem(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert params["a"] == 1 - assert params[["a", "b"]] == ParamsDict({"a": 1, "b": 2}) - with pytest.raises(KeyError): - _ = params["123"] # Changed from 123 to '123' - - -def test_setitem(): - params = ParamsDict({"a": 1}, valid_keys=["a", "b", "c", "d"]) - params.update({"b": 2}) - assert params == {"a": 1, "b": 2} - - params[["c", "d"]] = [3, 4] - assert params == {"a": 1, "b": 2, "c": 3, "d": 4} - - with pytest.raises(ValueError): - params[["e", "f"]] = [5] - - with pytest.raises(KeyError): - params[123] = 5 # type: ignore - - with pytest.raises(KeyError): - params.update({"d": 4, "e": 5}) - - -def test_delitem(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - del params["a"] - assert 
params == {"b": 2} - with pytest.raises(KeyError): - del params["123"] # Changed from 123 to '123' - - -def test_repr(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert repr(params) == "ParamsDict({'a': 1, 'b': 2})" - - -def test_str(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert str(params) == "a: 1, b: 2" - - -def test_len(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b", "c"]) - assert len(params) == 2 - params["c"] = 3 - assert len(params) == 3 - - -def test_keys_values_items(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert set(params.keys()) == {"a", "b"} - assert set(params.values()) == {1, 2} - assert set(params.items()) == {("a", 1), ("b", 2)} - - -def test_clear(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - params.clear() - assert len(params) == 0 - - -def test_as_dict(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert params.as_dict() == {"a": 1, "b": 2} - - -@responses.activate -def test_requests_integration(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - - url = "https://httpbin.org/get" - responses.add(responses.GET, url, json={"args": {"a": "1", "b": "2"}}, status=200) - - response = requests.get(url, params=params) - assert response.status_code == 200 - response_json = response.json() - assert response_json["args"] == {"a": "1", "b": "2"} - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_datacard.py b/tfbpapi/tests/test_datacard.py new file mode 100644 index 0000000..b9228d1 --- /dev/null +++ b/tfbpapi/tests/test_datacard.py @@ -0,0 +1,910 @@ +"""Tests for the DataCard class.""" + +from unittest.mock import Mock, patch + +import pytest + +from tfbpapi import DataCard +from tfbpapi.datacard import DatasetSchema +from tfbpapi.errors import DataCardError, DataCardValidationError, HfDataFetchError +from tfbpapi.models import DatasetType + + +def 
_external_metadata_card_data(): + """Card data with external metadata (no embedded metadata_fields).""" + return { + "configs": [ + { + "config_name": "coverage_data", + "description": "Coverage measurements", + "dataset_type": "genome_map", + "default": True, + "data_files": [{"split": "train", "path": "coverage.parquet"}], + "dataset_info": { + "features": [ + { + "name": "sample_id", + "dtype": "integer", + "description": "Sample ID", + }, + { + "name": "chr", + "dtype": "string", + "description": "Chromosome", + "role": "genomic_coordinate", + }, + { + "name": "coverage", + "dtype": "float32", + "description": "Coverage value", + "role": "quantitative_measure", + }, + ] + }, + }, + { + "config_name": "sample_metadata", + "description": "Sample metadata", + "dataset_type": "metadata", + "applies_to": ["coverage_data"], + "data_files": [{"split": "train", "path": "metadata.parquet"}], + "dataset_info": { + "features": [ + { + "name": "sample_id", + "dtype": "integer", + "description": "Sample ID", + }, + { + "name": "batch", + "dtype": "string", + "description": "Batch ID", + }, + { + "name": "regulator_locus_tag", + "dtype": "string", + "description": "TF locus tag", + "role": "regulator_identifier", + }, + { + "name": "regulator_symbol", + "dtype": "string", + "description": "TF symbol", + "role": "regulator_identifier", + }, + ] + }, + }, + ], + } + + +class TestDataCard: + """Test suite for DataCard class.""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_init( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + test_token, + ): + """Test DataCard initialization.""" + datacard = DataCard(test_repo_id, token=test_token) + + assert datacard.repo_id == test_repo_id + assert datacard.token == test_token + assert datacard._dataset_card is None + assert datacard._metadata_cache == {} + assert 
datacard._metadata_fields_map == {} + + # Check that fetchers were initialized + mock_card_fetcher.assert_called_once_with(token=test_token) + mock_structure_fetcher.assert_called_once_with(token=test_token) + mock_size_fetcher.assert_called_once_with(token=test_token) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_init_without_token( + self, mock_size_fetcher, mock_structure_fetcher, mock_card_fetcher, test_repo_id + ): + """Test DataCard initialization without token.""" + datacard = DataCard(test_repo_id) + + assert datacard.repo_id == test_repo_id + assert datacard.token is None + + # Check that fetchers were initialized without token + mock_card_fetcher.assert_called_once_with(token=None) + mock_structure_fetcher.assert_called_once_with(token=None) + mock_size_fetcher.assert_called_once_with(token=None) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_and_validate_card_success( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test successful card loading and validation.""" + # Setup mock + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + # Access dataset_card property to trigger loading + card = datacard.dataset_card + + assert card is not None + assert len(card.configs) == 4 + assert card.pretty_name == "Test Genomics Dataset" + mock_fetcher_instance.fetch.assert_called_once_with(test_repo_id) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_card_no_data( + self, mock_size_fetcher, 
mock_structure_fetcher, mock_card_fetcher, test_repo_id + ): + """Test handling when no dataset card is found.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = {} + + datacard = DataCard(test_repo_id) + + with pytest.raises(DataCardValidationError, match="No dataset card found"): + _ = datacard.dataset_card + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_card_validation_error( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + invalid_dataset_card_data, + ): + """Test handling of validation errors.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = invalid_dataset_card_data + + datacard = DataCard(test_repo_id) + + with pytest.raises( + DataCardValidationError, match="Dataset card validation failed" + ): + _ = datacard.dataset_card + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_card_fetch_error( + self, mock_size_fetcher, mock_structure_fetcher, mock_card_fetcher, test_repo_id + ): + """Test handling of fetch errors.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.side_effect = HfDataFetchError("Fetch failed") + + datacard = DataCard(test_repo_id) + + with pytest.raises(DataCardError, match="Failed to fetch dataset card"): + _ = datacard.dataset_card + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_configs_property( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + 
"""Test getting all configurations via property.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + configs = datacard.configs + + assert len(configs) == 4 + config_names = [config.config_name for config in configs] + assert "genomic_features" in config_names + assert "binding_data" in config_names + assert "genome_map_data" in config_names + assert "experiment_metadata" in config_names + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_config_by_name( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test getting a specific configuration by name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + config = datacard.get_config("binding_data") + assert config is not None + assert config.config_name == "binding_data" + assert config.dataset_type == DatasetType.ANNOTATED_FEATURES + + # Test non-existent config + assert datacard.get_config("nonexistent") is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_metadata_relationships( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test getting metadata relationships.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + relationships = datacard.get_metadata_relationships() + + # Should have 
explicit relationship between binding_data and experiment_metadata + explicit_rels = [r for r in relationships if r.relationship_type == "explicit"] + assert len(explicit_rels) == 1 + assert explicit_rels[0].data_config == "binding_data" + assert explicit_rels[0].metadata_config == "experiment_metadata" + + # Should have embedded relationship for binding_data (has metadata_fields) + embedded_rels = [r for r in relationships if r.relationship_type == "embedded"] + assert len(embedded_rels) == 1 + assert embedded_rels[0].data_config == "binding_data" + assert embedded_rels[0].metadata_config == "binding_data_embedded" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_repository_info_success( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + sample_repo_structure, + ): + """Test getting repository information.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.fetch.return_value = sample_repo_structure + + datacard = DataCard(test_repo_id) + + info = datacard.get_repository_info() + + assert info["repo_id"] == test_repo_id + assert info["pretty_name"] == "Test Genomics Dataset" + assert info["license"] == "mit" + assert info["num_configs"] == 4 + assert "genomic_features" in info["dataset_types"] + assert "annotated_features" in info["dataset_types"] + assert "genome_map" in info["dataset_types"] + assert "metadata" in info["dataset_types"] + assert info["total_files"] == 5 + assert info["last_modified"] == "2023-12-01T10:30:00Z" + assert info["has_default_config"] is True + + 
@patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_repository_info_fetch_error( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test getting repository info when structure fetch fails.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.fetch.side_effect = HfDataFetchError( + "Structure fetch failed" + ) + + datacard = DataCard(test_repo_id) + + info = datacard.get_repository_info() + + assert info["repo_id"] == test_repo_id + assert info["total_files"] is None + assert info["last_modified"] is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_summary( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + sample_repo_structure, + ): + """Test getting a summary of the dataset.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.fetch.return_value = sample_repo_structure + + datacard = DataCard(test_repo_id) + + summary = datacard.summary() + + assert "Dataset: Test Genomics Dataset" in summary + assert f"Repository: {test_repo_id}" in summary + assert "License: mit" in summary + assert "Configurations: 4" in summary + assert "genomic_features" in summary + 
assert "binding_data" in summary + assert "genome_map_data" in summary + assert "experiment_metadata" in summary + assert "(default)" in summary # genomic_features is marked as default + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.get_partition_values.return_value = [ + "TF1", + "TF2", + "TF3", + ] + + datacard = DataCard(test_repo_id) + + # Get the genome_map_data config which has partitioning enabled + config = datacard.get_config("genome_map_data") + assert config is not None + assert config.dataset_info.partitioning.enabled is True # type: ignore + + values = datacard._extract_partition_values(config, "regulator") + assert values == {"TF1", "TF2", "TF3"} + mock_structure_fetcher_instance.get_partition_values.assert_called_once_with( + test_repo_id, "regulator" + ) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values_no_partitioning( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values when partitioning is disabled.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = 
mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + # Get a config without partitioning + config = datacard.get_config("genomic_features") + assert config is not None + assert config.dataset_info.partitioning is None + + values = datacard._extract_partition_values(config, "some_field") + assert values == set() + mock_structure_fetcher_instance.get_partition_values.assert_not_called() + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values_field_not_in_partitions( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values when field is not a partition column.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + # Get the genome_map_data config which has partitioning enabled + config = datacard.get_config("genome_map_data") + assert config is not None + + # Try to extract values for a field that's not in partition_by + values = datacard._extract_partition_values(config, "not_a_partition_field") + assert values == set() + mock_structure_fetcher_instance.get_partition_values.assert_not_called() + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values_fetch_error( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values when fetch 
fails.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.get_partition_values.side_effect = ( + HfDataFetchError("Fetch failed") + ) + + datacard = DataCard(test_repo_id) + + config = datacard.get_config("genome_map_data") + values = datacard._extract_partition_values(config, "regulator") # type: ignore + + # Should return empty set on error + assert values == set() + + +class TestGetMetadataFields: + """Tests for DataCard.get_metadata_fields().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_fields( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Embedded metadata_fields on the data config are returned.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("binding_data") + + assert result == ["regulator_symbol", "experimental_condition"] + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_fields( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """External metadata via applies_to returns feature names.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = 
datacard.get_metadata_fields("coverage_data") + + assert result == [ + "sample_id", + "batch", + "regulator_locus_tag", + "regulator_symbol", + ] + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_no_metadata_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Config with no metadata returns None.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("genomic_features") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Unknown config name returns None.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("nonexistent") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_schema_includes_external_features( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """extract_metadata_schema includes roles from external metadata.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + 
schema = datacard.extract_metadata_schema("coverage_data") + + # External metadata features with role=regulator_identifier + assert "regulator_locus_tag" in schema["regulator_fields"] + assert "regulator_symbol" in schema["regulator_fields"] + # metadata_fields key populated + assert schema["metadata_fields"] is not None + assert "sample_id" in schema["metadata_fields"] + + +class TestGetMetadataConfigName: + """Tests for DataCard.get_metadata_config_name().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_returns_config_name( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """Returns metadata config name when applies_to matches.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("coverage_data") + + assert result == "sample_metadata" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None when metadata is embedded.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("binding_data") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + 
mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None for unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("nonexistent") + + assert result is None + + +class TestGetDataColNames: + """Tests for DataCard.get_data_col_names().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_returns_feature_names( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns column names from the data config's features.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("binding_data") + + # binding_data features: regulator_symbol, target_gene, + # experimental_condition, binding_score + assert isinstance(result, set) + assert result == { + "regulator_symbol", + "target_gene", + "experimental_condition", + "binding_score", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_config_returns_data_features( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """For external metadata, returns data config features only.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = 
datacard.get_data_col_names("coverage_data") + + # coverage_data features: sample_id, chr, coverage + assert result == {"sample_id", "chr", "coverage"} + # Must NOT include metadata-only columns + assert "batch" not in result + assert "regulator_locus_tag" not in result + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_empty_set( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns empty set for unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("nonexistent") + + assert result == set() + + +class TestGetDatasetSchema: + """Tests for DataCard.get_dataset_schema().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_returns_correct_schema( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Embedded metadata produces correct data/metadata column split.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + # binding_data has metadata_fields: [regulator_symbol, + # experimental_condition] and features: regulator_symbol, + # target_gene, experimental_condition, binding_score + result = datacard.get_dataset_schema("binding_data") + + assert result is not None + assert isinstance(result, DatasetSchema) + assert result.metadata_source == "embedded" + assert result.external_metadata_config is None + assert 
result.join_columns == set() + assert result.metadata_columns == { + "regulator_symbol", + "experimental_condition", + } + # data_columns = all features minus metadata_columns + assert result.data_columns == { + "target_gene", + "binding_score", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_returns_correct_schema( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """External metadata produces correct split and join columns.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + # coverage_data features: sample_id, chr, coverage + # sample_metadata features: sample_id, batch, regulator_locus_tag, + # regulator_symbol + # join_columns = intersection = {sample_id} + result = datacard.get_dataset_schema("coverage_data") + + assert result is not None + assert result.metadata_source == "external" + assert result.external_metadata_config == "sample_metadata" + assert result.data_columns == {"sample_id", "chr", "coverage"} + assert result.metadata_columns == { + "sample_id", + "batch", + "regulator_locus_tag", + "regulator_symbol", + } + assert result.join_columns == {"sample_id"} + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_no_metadata_returns_all_cols_as_data( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Config with no metadata relationship has all cols as data.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = 
DataCard(test_repo_id) + # genomic_features has no metadata_fields and no applies_to + result = datacard.get_dataset_schema("genomic_features") + + assert result is not None + assert result.metadata_source == "none" + assert result.external_metadata_config is None + assert result.metadata_columns == set() + assert result.join_columns == set() + assert result.data_columns == { + "gene_id", + "gene_symbol", + "chromosome", + "start", + "end", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None for an unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_dataset_schema("nonexistent") + + assert result is None diff --git a/tfbpapi/tests/test_datacard_parsing.py b/tfbpapi/tests/test_datacard_parsing.py new file mode 100644 index 0000000..31349e7 --- /dev/null +++ b/tfbpapi/tests/test_datacard_parsing.py @@ -0,0 +1,169 @@ +"""Test script to verify datacard parsing with new environmental_conditions.""" + +import yaml # type: ignore + +from tfbpapi.models import DatasetCard +from tfbpapi.tests.example_datacards import ( + EXAMPLE_1_SIMPLE_TOPLEVEL, + EXAMPLE_2_COMPLEX_FIELD_DEFINITIONS, + EXAMPLE_3_PARTITIONED_WITH_METADATA, +) + + +def test_example_1(): + """Test parsing example 1: simple top-level conditions.""" + print("=" * 80) + print("Testing Example 1: Simple Top-Level Conditions") + print("=" * 80) + + # Extract YAML from markdown + yaml_content = EXAMPLE_1_SIMPLE_TOPLEVEL.split("---")[1] + data = yaml.safe_load(yaml_content) + + try: + card = DatasetCard(**data) + print("✓ Successfully parsed 
Example 1") + print(f" - Configs: {len(card.configs)}") + print( + " - Top-level experimental_conditions: " + f"{card.experimental_conditions is not None}" + ) + + if card.experimental_conditions: + env_cond = card.experimental_conditions.environmental_conditions + if env_cond: + print(f" - Temperature: {env_cond.temperature_celsius}°C") + print(f" - Cultivation: {env_cond.cultivation_method}") + if env_cond.media: + print(f" - Media: {env_cond.media.name}") + print(f" - Carbon sources: {len(env_cond.media.carbon_source)}") + print( + f" - Nitrogen sources: {len(env_cond.media.nitrogen_source)}" + ) + + # Check field-level definitions + config = card.configs[0] + for feature in config.dataset_info.features: + if feature.definitions: + print( + f" - Feature '{feature.name}' has " + f"{len(feature.definitions)} definitions" + ) + for def_name in feature.definitions.keys(): + print(f" - {def_name}") + + print() + return True + except Exception as e: + print(f"✗ Failed to parse Example 1: {e}") + import traceback + + traceback.print_exc() + print() + return False + + +def test_example_2(): + """Test parsing example 2: complex field-level definitions.""" + print("=" * 80) + print("Testing Example 2: Complex Field-Level Definitions") + print("=" * 80) + + yaml_content = EXAMPLE_2_COMPLEX_FIELD_DEFINITIONS.split("---")[1] + data = yaml.safe_load(yaml_content) + + try: + card = DatasetCard(**data) + print("✓ Successfully parsed Example 2") + print(f" - Configs: {len(card.configs)}") + print(f" - Strain information: {card.strain_information is not None}") + + # Check field-level definitions + config = card.configs[0] + for feature in config.dataset_info.features: + if feature.definitions: + print( + f" - Feature '{feature.name}' has " + f"{len(feature.definitions)} definitions:" + ) + for def_name, def_value in feature.definitions.items(): + print(f" - {def_name}") + if "environmental_conditions" in def_value: + env = def_value["environmental_conditions"] + if 
"temperature_celsius" in env: + print(f" Temperature: {env['temperature_celsius']}°C") + if "media" in env: + print(f" Media: {env['media']['name']}") + + print() + return True + except Exception as e: + print(f"✗ Failed to parse Example 2: {e}") + import traceback + + traceback.print_exc() + print() + return False + + +def test_example_3(): + """Test parsing example 3: partitioned with metadata.""" + print("=" * 80) + print("Testing Example 3: Partitioned with Metadata") + print("=" * 80) + + yaml_content = EXAMPLE_3_PARTITIONED_WITH_METADATA.split("---")[1] + data = yaml.safe_load(yaml_content) + + try: + card = DatasetCard(**data) + print("✓ Successfully parsed Example 3") + print(f" - Configs: {len(card.configs)}") + print( + " - Top-level experimental_conditions: " + f"{card.experimental_conditions is not None}" + ) + + if card.experimental_conditions: + env_cond = card.experimental_conditions.environmental_conditions + if env_cond and env_cond.media: + print(f" - Top-level media: {env_cond.media.name}") + + # Check config-level experimental_conditions + for config in card.configs: + if config.experimental_conditions: + print(f" - Config '{config.config_name}' has experimental_conditions") + env_cond = config.experimental_conditions.environmental_conditions + if env_cond and env_cond.media: + print(f" - Media: {env_cond.media.name}") + print(f" - Temperature: {env_cond.temperature_celsius}°C") + + print() + return True + except Exception as e: + print(f"✗ Failed to parse Example 3: {e}") + import traceback + + traceback.print_exc() + print() + return False + + +if __name__ == "__main__": + results = [] + + results.append(test_example_1()) + results.append(test_example_2()) + results.append(test_example_3()) + + print("=" * 80) + print("Summary") + print("=" * 80) + print(f"Passed: {sum(results)}/{len(results)}") + + if all(results): + print("\n✓ All tests passed!") + exit(0) + else: + print("\n✗ Some tests failed") + exit(1) diff --git 
a/tfbpapi/tests/test_fetchers.py b/tfbpapi/tests/test_fetchers.py new file mode 100644 index 0000000..b197498 --- /dev/null +++ b/tfbpapi/tests/test_fetchers.py @@ -0,0 +1,435 @@ +"""Tests for datainfo fetcher classes.""" + +from unittest.mock import Mock, patch + +import pytest +import requests +from requests import HTTPError + +from tfbpapi.errors import HfDataFetchError +from tfbpapi.fetchers import ( + HfDataCardFetcher, + HfRepoStructureFetcher, + HfSizeInfoFetcher, +) + + +class TestHfDataCardFetcher: + """Test HfDataCardFetcher class.""" + + def test_init_with_token(self, test_token): + """Test initialization with token.""" + fetcher = HfDataCardFetcher(token=test_token) + assert fetcher.token == test_token + + def test_init_without_token(self): + """Test initialization without token.""" + with patch.dict("os.environ", {}, clear=True): + fetcher = HfDataCardFetcher() + assert fetcher.token is None + + def test_init_with_env_token(self, test_token): + """Test initialization with environment token.""" + with patch.dict("os.environ", {"HF_TOKEN": test_token}): + fetcher = HfDataCardFetcher() + assert fetcher.token == test_token + + @patch("tfbpapi.fetchers.DatasetCard") + def test_fetch_success( + self, mock_dataset_card, test_repo_id, sample_dataset_card_data + ): + """Test successful dataset card fetch.""" + # Setup mock + mock_card = Mock() + mock_card.data.to_dict.return_value = sample_dataset_card_data + mock_dataset_card.load.return_value = mock_card + + fetcher = HfDataCardFetcher(token="test_token") + result = fetcher.fetch(test_repo_id) + + assert result == sample_dataset_card_data + mock_dataset_card.load.assert_called_once_with( + test_repo_id, repo_type="dataset", token="test_token" + ) + + @patch("tfbpapi.fetchers.DatasetCard") + def test_fetch_no_data_section(self, mock_dataset_card, test_repo_id): + """Test fetch when dataset card has no data section.""" + # Setup mock with no data + mock_card = Mock() + mock_card.data = None + 
mock_dataset_card.load.return_value = mock_card + + fetcher = HfDataCardFetcher() + result = fetcher.fetch(test_repo_id) + + assert result == {} + + @patch("tfbpapi.fetchers.DatasetCard") + def test_fetch_exception(self, mock_dataset_card, test_repo_id): + """Test fetch when DatasetCard.load raises exception.""" + mock_dataset_card.load.side_effect = Exception("API Error") + + fetcher = HfDataCardFetcher() + + with pytest.raises(HfDataFetchError, match="Failed to fetch dataset card"): + fetcher.fetch(test_repo_id) + + def test_fetch_different_repo_types(self, sample_dataset_card_data): + """Test fetch with different repository types.""" + with patch("tfbpapi.fetchers.DatasetCard") as mock_dataset_card: + mock_card = Mock() + mock_card.data.to_dict.return_value = sample_dataset_card_data + mock_dataset_card.load.return_value = mock_card + + fetcher = HfDataCardFetcher() + + # Test with model repo + fetcher.fetch("test/repo", repo_type="model") + mock_dataset_card.load.assert_called_with( + "test/repo", repo_type="model", token=None + ) + + # Test with space repo + fetcher.fetch("test/repo", repo_type="space") + mock_dataset_card.load.assert_called_with( + "test/repo", repo_type="space", token=None + ) + + +class TestHfSizeInfoFetcher: + """Test HfSizeInfoFetcher class.""" + + def test_init(self, test_token): + """Test initialization.""" + fetcher = HfSizeInfoFetcher(token=test_token) + assert fetcher.token == test_token + assert fetcher.base_url == "https://datasets-server.huggingface.co" + + def test_build_headers_with_token(self, test_token): + """Test building headers with token.""" + fetcher = HfSizeInfoFetcher(token=test_token) + headers = fetcher._build_headers() + + assert headers["User-Agent"] == "TFBP-API/1.0" + assert headers["Authorization"] == f"Bearer {test_token}" + + def test_build_headers_without_token(self): + """Test building headers without token.""" + fetcher = HfSizeInfoFetcher() + headers = fetcher._build_headers() + + assert 
headers["User-Agent"] == "TFBP-API/1.0" + assert "Authorization" not in headers + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_success(self, mock_get, test_repo_id, sample_size_info): + """Test successful size info fetch.""" + # Setup mock response + mock_response = Mock() + mock_response.json.return_value = sample_size_info + mock_get.return_value = mock_response + + fetcher = HfSizeInfoFetcher(token="test_token") + result = fetcher.fetch(test_repo_id) + + assert result == sample_size_info + mock_get.assert_called_once() + + # Check call arguments + call_args = mock_get.call_args + assert call_args[1]["params"]["dataset"] == test_repo_id + assert call_args[1]["headers"]["Authorization"] == "Bearer test_token" + assert call_args[1]["timeout"] == 30 + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_404_error(self, mock_get, test_repo_id): + """Test fetch with 404 error.""" + # Setup mock 404 response + mock_response = Mock() + mock_response.status_code = 404 + error = HTTPError(response=mock_response) + mock_get.side_effect = error + + fetcher = HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="Dataset .* not found"): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_403_error(self, mock_get, test_repo_id): + """Test fetch with 403 error.""" + # Setup mock 403 response + mock_response = Mock() + mock_response.status_code = 403 + error = HTTPError(response=mock_response) + mock_get.side_effect = error + + fetcher = HfSizeInfoFetcher() + + with pytest.raises( + HfDataFetchError, match="Access denied.*check token permissions" + ): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_other_http_error(self, mock_get, test_repo_id): + """Test fetch with other HTTP error.""" + # Setup mock 500 response + mock_response = Mock() + mock_response.status_code = 500 + error = HTTPError(response=mock_response) + mock_get.side_effect = error + + fetcher = 
HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="HTTP error fetching size"): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_request_exception(self, mock_get, test_repo_id): + """Test fetch with request exception.""" + mock_get.side_effect = requests.RequestException("Network error") + + fetcher = HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="Request failed fetching size"): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_json_decode_error(self, mock_get, test_repo_id): + """Test fetch with JSON decode error.""" + # Setup mock response with invalid JSON + mock_response = Mock() + mock_response.json.side_effect = ValueError("Invalid JSON") + mock_get.return_value = mock_response + + fetcher = HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="Invalid JSON response"): + fetcher.fetch(test_repo_id) + + +class TestHfRepoStructureFetcher: + """Test HfRepoStructureFetcher class.""" + + def test_init(self, test_token): + """Test initialization.""" + fetcher = HfRepoStructureFetcher(token=test_token) + assert fetcher.token == test_token + assert fetcher._cached_structure == {} + + @patch("tfbpapi.fetchers.repo_info") + def test_fetch_success(self, mock_repo_info, test_repo_id, sample_repo_structure): + """Test successful repository structure fetch.""" + # Setup mock repo info + mock_info = Mock() + mock_info.siblings = [ + Mock(rfilename="features.parquet", size=2048000, lfs=Mock()), + Mock(rfilename="binding/part1.parquet", size=1024000, lfs=Mock()), + Mock( + rfilename="tracks/regulator=TF1/experiment=exp1/data.parquet", + size=5120000, + lfs=Mock(), + ), + ] + mock_info.last_modified.isoformat.return_value = "2023-12-01T10:30:00Z" + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher(token="test_token") + result = fetcher.fetch(test_repo_id) + + assert result["repo_id"] == test_repo_id + assert 
result["total_files"] == 3 + assert len(result["files"]) == 3 + assert result["last_modified"] == "2023-12-01T10:30:00Z" + + # Check that repo_info was called correctly + mock_repo_info.assert_called_once_with( + repo_id=test_repo_id, repo_type="dataset", token="test_token" + ) + + @patch("tfbpapi.fetchers.repo_info") + def test_fetch_with_caching(self, mock_repo_info, test_repo_id): + """Test fetch with caching behavior.""" + # Setup mock + mock_info = Mock() + mock_info.siblings = [] + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + + # First fetch + result1 = fetcher.fetch(test_repo_id) + assert mock_repo_info.call_count == 1 + + # Second fetch should use cache + result2 = fetcher.fetch(test_repo_id) + assert mock_repo_info.call_count == 1 # Not called again + assert result1 == result2 + + # Force refresh should call API again + fetcher.fetch(test_repo_id, force_refresh=True) + assert mock_repo_info.call_count == 2 + + @patch("tfbpapi.fetchers.repo_info") + def test_fetch_siblings_none(self, mock_repo_info, test_repo_id): + """Test fetch when siblings is None.""" + # Setup mock with None siblings + mock_info = Mock() + mock_info.siblings = None + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + result = fetcher.fetch(test_repo_id) + + assert result["total_files"] == 0 + assert result["files"] == [] + assert result["partitions"] == {} + + @patch("tfbpapi.fetchers.repo_info") + def test_fetch_exception(self, mock_repo_info, test_repo_id): + """Test fetch when repo_info raises exception.""" + mock_repo_info.side_effect = Exception("API Error") + + fetcher = HfRepoStructureFetcher() + + with pytest.raises(HfDataFetchError, match="Failed to fetch repo structure"): + fetcher.fetch(test_repo_id) + + def test_extract_partition_info(self): + """Test extracting partition information from file paths.""" + fetcher = HfRepoStructureFetcher() + 
        partitions = {}  # type: ignore

        # Test normal partition pattern (Hive-style key=value path segments)
        fetcher._extract_partition_info(
            "data/regulator=TF1/condition=control/file.parquet", partitions
        )
        assert "regulator" in partitions
        assert "TF1" in partitions["regulator"]
        assert "condition" in partitions
        assert "control" in partitions["condition"]

        # Test multiple values for same partition
        fetcher._extract_partition_info(
            "data/regulator=TF2/condition=treatment/file.parquet", partitions
        )
        # Values accumulate per key; len == 2 implies uniqued collection
        # (presumably a set — TODO confirm against _extract_partition_info).
        assert len(partitions["regulator"]) == 2
        assert "TF2" in partitions["regulator"]
        assert "treatment" in partitions["condition"]

        # Test file without partitions
        fetcher._extract_partition_info("simple_file.parquet", partitions)
        # partitions dict should remain unchanged
        assert len(partitions) == 2

    @patch("tfbpapi.fetchers.repo_info")
    def test_get_partition_values_success(self, mock_repo_info, test_repo_id):
        """Test getting partition values for a specific column."""
        # Setup mock with partitioned files
        mock_info = Mock()
        mock_info.siblings = [
            Mock(rfilename="data/regulator=TF1/file1.parquet", size=1000, lfs=None),
            Mock(rfilename="data/regulator=TF2/file2.parquet", size=1000, lfs=None),
            Mock(rfilename="data/regulator=TF3/file3.parquet", size=1000, lfs=None),
        ]
        mock_info.last_modified = None
        mock_repo_info.return_value = mock_info

        fetcher = HfRepoStructureFetcher()
        values = fetcher.get_partition_values(test_repo_id, "regulator")

        assert values == ["TF1", "TF2", "TF3"]  # Should be sorted

    @patch("tfbpapi.fetchers.repo_info")
    def test_get_partition_values_no_partitions(self, mock_repo_info, test_repo_id):
        """Test getting partition values when no partitions exist."""
        # Setup mock with no partitioned files
        mock_info = Mock()
        mock_info.siblings = [
            Mock(rfilename="simple_file.parquet", size=1000, lfs=None),
        ]
        mock_info.last_modified = None
        mock_repo_info.return_value = mock_info

        fetcher = HfRepoStructureFetcher()
        values = fetcher.get_partition_values(test_repo_id, "regulator")

        assert values == []

    @patch("tfbpapi.fetchers.repo_info")
    def test_get_dataset_files_all(self, mock_repo_info, test_repo_id):
        """Test getting all dataset files."""
        # Setup mock
        mock_info = Mock()
        mock_info.siblings = [
            Mock(rfilename="file1.parquet", size=1000, lfs=None),
            Mock(rfilename="file2.parquet", size=2000, lfs=Mock()),
        ]
        mock_info.last_modified = None
        mock_repo_info.return_value = mock_info

        fetcher = HfRepoStructureFetcher()
        files = fetcher.get_dataset_files(test_repo_id)

        # is_lfs is derived from the sibling's `lfs` attribute being truthy.
        assert len(files) == 2
        assert files[0]["path"] == "file1.parquet"
        assert files[0]["size"] == 1000
        assert files[0]["is_lfs"] is False

        assert files[1]["path"] == "file2.parquet"
        assert files[1]["size"] == 2000
        assert files[1]["is_lfs"] is True

    @patch("tfbpapi.fetchers.repo_info")
    def test_get_dataset_files_with_pattern(self, mock_repo_info, test_repo_id):
        """Test getting dataset files with path pattern filter."""
        # Setup mock
        mock_info = Mock()
        mock_info.siblings = [
            Mock(rfilename="data/file1.parquet", size=1000, lfs=None),
            Mock(rfilename="metadata/info.json", size=500, lfs=None),
            Mock(rfilename="data/file2.parquet", size=2000, lfs=None),
        ]
        mock_info.last_modified = None
        mock_repo_info.return_value = mock_info

        fetcher = HfRepoStructureFetcher()
        # path_pattern is a regex applied to the full repo-relative path.
        files = fetcher.get_dataset_files(test_repo_id, path_pattern=r".*\.parquet$")

        assert len(files) == 2
        assert all(f["path"].endswith(".parquet") for f in files)

    def test_get_dataset_files_uses_cache(self):
        """Test that get_dataset_files uses fetch caching."""
        fetcher = HfRepoStructureFetcher()

        with patch.object(fetcher, "fetch") as mock_fetch:
            mock_fetch.return_value = {"files": []}

            # First call
            fetcher.get_dataset_files("test/repo")
            mock_fetch.assert_called_with("test/repo", force_refresh=False)

            # Second call with force_refresh
            fetcher.get_dataset_files("test/repo",
force_refresh=True)
            mock_fetch.assert_called_with("test/repo", force_refresh=True)

    def test_get_partition_values_uses_cache(self):
        """Test that get_partition_values uses fetch caching."""
        fetcher = HfRepoStructureFetcher()

        with patch.object(fetcher, "fetch") as mock_fetch:
            # fetch() returns sets per partition key; the public API sorts them.
            mock_fetch.return_value = {"partitions": {"regulator": {"TF1", "TF2"}}}

            # First call
            result = fetcher.get_partition_values("test/repo", "regulator")
            mock_fetch.assert_called_with("test/repo", force_refresh=False)
            assert result == ["TF1", "TF2"]

            # Second call with force_refresh
            fetcher.get_partition_values("test/repo", "regulator", force_refresh=True)
            mock_fetch.assert_called_with("test/repo", force_refresh=True)
diff --git a/tfbpapi/tests/test_hf_cache_manager.py b/tfbpapi/tests/test_hf_cache_manager.py
new file mode 100644
index 0000000..aa395df
--- /dev/null
+++ b/tfbpapi/tests/test_hf_cache_manager.py
@@ -0,0 +1,783 @@
"""Comprehensive tests for HfCacheManager class."""

import logging
from datetime import datetime, timedelta
from unittest.mock import Mock, patch

import duckdb
import pytest

from tfbpapi.hf_cache_manager import HfCacheManager
from tfbpapi.models import DatasetType


class TestHfCacheManagerInit:
    """Test HfCacheManager initialization."""

    def test_init_basic(self):
        """Test basic initialization."""
        conn = duckdb.connect(":memory:")
        repo_id = "test/repo"

        # DataCard.__init__ normally hits the HuggingFace hub; stub it out so
        # construction stays offline.
        with patch(
            "tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None
        ) as mock_datacard_init:
            cache_manager = HfCacheManager(repo_id, conn)
            # Manually set the properties that would normally
            # be set by DataCard.__init__
            cache_manager.repo_id = repo_id
            cache_manager.token = None

            assert cache_manager.repo_id == repo_id
            assert cache_manager.duckdb_conn == conn
            assert cache_manager.token is None
            assert cache_manager.logger is not None
            # DataCard should be initialized as parent
            mock_datacard_init.assert_called_once_with(repo_id, None)

    def test_init_with_token_and_logger(self):
        """Test initialization with token and custom logger."""
        conn = duckdb.connect(":memory:")
        repo_id = "test/repo"
        token = "test_token"
        logger = logging.getLogger("test_logger")

        with patch(
            "tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None
        ) as mock_datacard_init:
            cache_manager = HfCacheManager(repo_id, conn, token=token, logger=logger)
            # Manually set the properties that would
            # normally be set by DataCard.__init__
            cache_manager.repo_id = repo_id
            cache_manager.token = token

            assert cache_manager.repo_id == repo_id
            assert cache_manager.duckdb_conn == conn
            assert cache_manager.token == token
            assert cache_manager.logger == logger
            # DataCard should be initialized as parent with token
            mock_datacard_init.assert_called_once_with(repo_id, token)


class TestHfCacheManagerDatacard:
    """Test DataCard integration since HfCacheManager now inherits from DataCard."""

    def test_datacard_inheritance(self):
        """Test that HfCacheManager properly inherits from DataCard."""
        conn = duckdb.connect(":memory:")
        repo_id = "test/repo"
        token = "test_token"

        with patch(
            "tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None
        ) as mock_datacard_init:
            cache_manager = HfCacheManager(repo_id, conn, token=token)

            # DataCard should be initialized during construction
            mock_datacard_init.assert_called_once_with(repo_id, token)

            # Should have DataCard methods available (they exist on the class)
            assert hasattr(cache_manager, "get_config")


class TestHfCacheManagerDuckDBOperations:
    """Test DuckDB operations that are still part of HfCacheManager."""

    @patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None)
    def test_create_duckdb_table_from_files_single_file(
        self, mock_datacard_init, tmpdir
    ):
        """Test creating DuckDB table from single parquet file."""
        # Create a mock parquet file (contents never parsed — the connection
        # is a Mock, so only the generated SQL text is inspected).
        parquet_file = tmpdir.join("test.parquet")
        parquet_file.write("dummy_content")

        # Use a separate cache manager with mock connection for this test
        mock_conn = Mock()
        test_cache_manager = HfCacheManager("test/repo", mock_conn)

        # Mock the validation method since we're testing table creation
        test_cache_manager._validate_source_sample_fields = Mock()  # type: ignore

        test_cache_manager._create_duckdb_table_from_files(
            [str(parquet_file)], "test_table", "test_config"
        )

        mock_conn.execute.assert_called_once()
        sql_call = mock_conn.execute.call_args[0][0]
        assert "CREATE OR REPLACE VIEW test_table" in sql_call
        assert str(parquet_file) in sql_call

    @patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None)
    def test_create_duckdb_table_from_files_multiple_files(
        self, mock_datacard_init, tmpdir
    ):
        """Test creating DuckDB table from multiple parquet files."""
        # Create mock parquet files
        file1 = tmpdir.join("test1.parquet")
        file1.write("dummy_content1")
        file2 = tmpdir.join("test2.parquet")
        file2.write("dummy_content2")

        files = [str(file1), str(file2)]

        # Use a separate cache manager with mock connection for this test
        mock_conn = Mock()
        test_cache_manager = HfCacheManager("test/repo", mock_conn)

        # Mock the validation method since we're testing table creation
        test_cache_manager._validate_source_sample_fields = Mock()  # type: ignore

        test_cache_manager._create_duckdb_table_from_files(
            files, "test_table", "test_config"
        )

        # One view creation covering both file paths.
        mock_conn.execute.assert_called_once()
        sql_call = mock_conn.execute.call_args[0][0]
        assert "CREATE OR REPLACE VIEW test_table" in sql_call
        assert str(file1) in sql_call
        assert str(file2) in sql_call


class TestHfCacheManagerCacheManagement:
    """Test cache management functionality."""

    def setup_method(self):
        """Set up test fixtures."""
        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            self.conn = duckdb.connect(":memory:")
            self.repo_id = "test/repo"
            self.cache_manager = HfCacheManager(self.repo_id, self.conn)

    def test_parse_size_string(self):
        """Test size string parsing."""
        # Sizes are binary (base-1024) multiples; a bare number is raw bytes.
        assert self.cache_manager._parse_size_string("10KB") == 10 * 1024
        assert self.cache_manager._parse_size_string("5MB") == 5 * 1024**2
        assert self.cache_manager._parse_size_string("2GB") == 2 * 1024**3
        assert self.cache_manager._parse_size_string("1TB") == 1 * 1024**4
        assert self.cache_manager._parse_size_string("500") == 500
        assert self.cache_manager._parse_size_string("10.5GB") == int(10.5 * 1024**3)

    def test_format_bytes(self):
        """Test byte formatting."""
        assert self.cache_manager._format_bytes(0) == "0B"
        assert self.cache_manager._format_bytes(1023) == "1023.0B"
        assert self.cache_manager._format_bytes(1024) == "1.0KB"
        assert self.cache_manager._format_bytes(1024**2) == "1.0MB"
        assert self.cache_manager._format_bytes(1024**3) == "1.0GB"
        assert self.cache_manager._format_bytes(1024**4) == "1.0TB"

    @patch("tfbpapi.hf_cache_manager.scan_cache_dir")
    def test_clean_cache_by_age(self, mock_scan_cache_dir):
        """Test age-based cache cleaning."""
        # Setup mock cache info: one revision 35 days old, past the 30-day cutoff.
        mock_cache_info = Mock()
        mock_revision = Mock()
        mock_revision.commit_hash = "abc123"
        mock_revision.last_modified = (datetime.now() - timedelta(days=35)).timestamp()

        mock_repo = Mock()
        mock_repo.revisions = [mock_revision]

        mock_cache_info.repos = [mock_repo]
        mock_delete_strategy = Mock()
        mock_delete_strategy.expected_freed_size_str = "100MB"
        mock_cache_info.delete_revisions.return_value = mock_delete_strategy

        mock_scan_cache_dir.return_value = mock_cache_info

        result = self.cache_manager.clean_cache_by_age(max_age_days=30, dry_run=True)

        assert result == mock_delete_strategy
        mock_cache_info.delete_revisions.assert_called_once_with("abc123")

    @patch("tfbpapi.hf_cache_manager.scan_cache_dir")
    def test_clean_cache_by_age_no_old_revisions(self, mock_scan_cache_dir):
        """Test age-based cleaning when no old revisions exist."""
        mock_cache_info = Mock()
        mock_revision = Mock()
        mock_revision.commit_hash = "abc123"
        mock_revision.last_modified = datetime.now().timestamp()  # Recent

        mock_repo = Mock()
        mock_repo.revisions = [mock_revision]

        mock_cache_info.repos = [mock_repo]
        mock_delete_strategy = Mock()
        mock_delete_strategy.expected_freed_size_str = "0B"
        mock_cache_info.delete_revisions.return_value = mock_delete_strategy

        mock_scan_cache_dir.return_value = mock_cache_info

        result = self.cache_manager.clean_cache_by_age(max_age_days=30, dry_run=True)

        # Should still return a strategy, but with empty revisions
        assert result == mock_delete_strategy
        mock_cache_info.delete_revisions.assert_called_once_with()

    @patch("tfbpapi.hf_cache_manager.scan_cache_dir")
    def test_clean_cache_by_size(self, mock_scan_cache_dir):
        """Test size-based cache cleaning."""
        # Setup mock cache info: 5GB on disk vs a 3GB target, so at least
        # one revision must be scheduled for deletion.
        mock_cache_info = Mock()
        mock_cache_info.size_on_disk = 5 * 1024**3  # 5GB
        mock_cache_info.size_on_disk_str = "5.0GB"

        mock_revision = Mock()
        mock_revision.commit_hash = "abc123"
        mock_revision.last_modified = datetime.now().timestamp()
        mock_revision.size_on_disk = 2 * 1024**3  # 2GB

        mock_repo = Mock()
        mock_repo.revisions = [mock_revision]

        mock_cache_info.repos = [mock_repo]
        mock_delete_strategy = Mock()
        mock_delete_strategy.expected_freed_size_str = "2GB"
        mock_cache_info.delete_revisions.return_value = mock_delete_strategy

        mock_scan_cache_dir.return_value = mock_cache_info

        result = self.cache_manager.clean_cache_by_size(
            target_size="3GB", strategy="oldest_first", dry_run=True
        )

        assert result == mock_delete_strategy
        mock_cache_info.delete_revisions.assert_called_once()

    @patch("tfbpapi.hf_cache_manager.scan_cache_dir")
    def test_clean_cache_by_size_already_under_target(self, mock_scan_cache_dir):
        """Test size-based cleaning when already under target."""
        mock_cache_info = Mock()
        mock_cache_info.size_on_disk = 1 * 1024**3  # 1GB
        mock_cache_info.size_on_disk_str = "1.0GB"
        mock_cache_info.repos = []

        mock_delete_strategy = Mock()
        mock_delete_strategy.expected_freed_size_str = "0B"
        mock_cache_info.delete_revisions.return_value = mock_delete_strategy

        mock_scan_cache_dir.return_value = mock_cache_info

        result = self.cache_manager.clean_cache_by_size(
            target_size="2GB", strategy="oldest_first", dry_run=True
        )

        assert result == mock_delete_strategy

    @patch("tfbpapi.hf_cache_manager.scan_cache_dir")
    def test_clean_unused_revisions(self, mock_scan_cache_dir):
        """Test cleaning unused revisions."""
        # Setup mock with multiple revisions (1, 10, and 20 days old).
        mock_cache_info = Mock()

        mock_revision1 = Mock()
        mock_revision1.commit_hash = "abc123"
        mock_revision1.last_modified = (datetime.now() - timedelta(days=1)).timestamp()

        mock_revision2 = Mock()
        mock_revision2.commit_hash = "def456"
        mock_revision2.last_modified = (datetime.now() - timedelta(days=10)).timestamp()

        mock_revision3 = Mock()
        mock_revision3.commit_hash = "ghi789"
        mock_revision3.last_modified = (datetime.now() - timedelta(days=20)).timestamp()

        mock_repo = Mock()
        mock_repo.revisions = [mock_revision1, mock_revision2, mock_revision3]

        mock_cache_info.repos = [mock_repo]
        mock_delete_strategy = Mock()
        mock_delete_strategy.expected_freed_size_str = "1GB"
        mock_cache_info.delete_revisions.return_value = mock_delete_strategy

        mock_scan_cache_dir.return_value = mock_cache_info

        # keep_latest=2 keeps the two most recent revisions per repo.
        result = self.cache_manager.clean_unused_revisions(keep_latest=2, dry_run=True)

        assert result == mock_delete_strategy
        # Should delete oldest revision (ghi789)
        mock_cache_info.delete_revisions.assert_called_once_with("ghi789")

    @patch("tfbpapi.hf_cache_manager.scan_cache_dir")
    def test_auto_clean_cache(self, mock_scan_cache_dir):
        """Test automated cache cleaning."""
        mock_cache_info = Mock()
        mock_cache_info.size_on_disk = 10 * 1024**3  # 10GB
        mock_cache_info.repos = []

        mock_delete_strategy = Mock()
        mock_delete_strategy.expected_freed_size = 1 * 1024**3  # 1GB
        mock_delete_strategy.expected_freed_size_str = "1GB"

        mock_scan_cache_dir.return_value = mock_cache_info

        # Stub the three individual strategies; auto_clean_cache should
        # delegate to each of them exactly once and collect the results.
        with patch.object(
            self.cache_manager, "clean_cache_by_age", return_value=mock_delete_strategy
        ):
            with patch.object(
                self.cache_manager,
                "clean_unused_revisions",
                return_value=mock_delete_strategy,
            ):
                with patch.object(
                    self.cache_manager,
                    "clean_cache_by_size",
                    return_value=mock_delete_strategy,
                ):
                    result = self.cache_manager.auto_clean_cache(
                        max_age_days=30,
                        max_total_size="5GB",
                        keep_latest_per_repo=2,
                        dry_run=True,
                    )

                    assert (
                        len(result) == 3
                    )  # All three cleanup strategies should be executed
                    assert all(strategy == mock_delete_strategy for strategy in result)


class TestHfCacheManagerErrorHandling:
    """Test error handling and edge cases."""

    def setup_method(self):
        """Set up test fixtures."""
        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            self.conn = duckdb.connect(":memory:")
            self.repo_id = "test/repo"
            self.cache_manager = HfCacheManager(self.repo_id, self.conn)

    def test_parse_size_string_invalid_input(self):
        """Test error handling for invalid size strings."""
        with pytest.raises(ValueError):
            self.cache_manager._parse_size_string("invalid")

    @patch("tfbpapi.hf_cache_manager.scan_cache_dir")
    def test_clean_cache_invalid_strategy(self, mock_scan_cache_dir):
        """Test error handling for invalid cleanup strategy."""
        mock_cache_info = Mock()
        mock_cache_info.size_on_disk = 5 * 1024**3
        mock_cache_info.repos = []
        mock_scan_cache_dir.return_value = mock_cache_info

        with pytest.raises(ValueError, match="Unknown strategy"):
            self.cache_manager.clean_cache_by_size(
                target_size="1GB",
                strategy="invalid_strategy",  # type: ignore[arg-type]
                dry_run=True,
            )


class TestHfCacheManagerIntegration:
    """Integration
    tests with real DuckDB operations."""

    def setup_method(self):
        """Set up test fixtures (real in-memory DuckDB connection)."""
        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            self.conn = duckdb.connect(":memory:")
            self.repo_id = "test/repo"
            self.cache_manager = HfCacheManager(self.repo_id, self.conn)

    def test_metadata_workflow_integration(self, tmpdir):
        """Test complete metadata workflow with real files."""
        # Create temporary parquet file content
        metadata_file = tmpdir.join("metadata.parquet")
        metadata_file.write("dummy_parquet_content")

        # Test the core table creation functionality
        mock_conn = Mock()
        test_cache_manager = HfCacheManager("test/repo", mock_conn)

        # Mock the validation method since we're testing table creation
        test_cache_manager._validate_source_sample_fields = Mock()  # type: ignore

        # Test _create_duckdb_table_from_files directly
        test_cache_manager._create_duckdb_table_from_files(
            [str(metadata_file)], "metadata_test_metadata", "test_metadata"
        )

        # Verify the SQL was generated correctly
        mock_conn.execute.assert_called_once()
        sql_call = mock_conn.execute.call_args[0][0]
        assert "CREATE OR REPLACE VIEW metadata_test_metadata" in sql_call
        assert str(metadata_file) in sql_call

    def test_embedded_metadata_workflow_integration(self):
        """Test complete embedded metadata workflow with real DuckDB operations."""
        # Create real test data in DuckDB: 30 rows split evenly across
        # three experimental conditions via row_number() % 3.
        self.conn.execute(
            """
            CREATE TABLE test_data AS
            SELECT
                'gene_' || (row_number() OVER()) as gene_id,
                CASE
                    WHEN (row_number() OVER()) % 3 = 0 THEN 'treatment_A'
                    WHEN (row_number() OVER()) % 3 = 1 THEN 'treatment_B'
                    ELSE 'control'
                END as experimental_condition,
                random() * 1000 as expression_value
            FROM range(30)
            """
        )

        # Extract embedded metadata
        result = self.cache_manager._extract_embedded_metadata_field(
            "test_data", "experimental_condition", "metadata_test_condition"
        )

        assert result is True

        # Verify the metadata table was created correctly
        metadata_results = self.conn.execute(
            "SELECT value, count FROM metadata_test_condition ORDER BY count DESC"
        ).fetchall()

        assert len(metadata_results) == 3  # Three unique conditions

        # Check that the counts make sense (should be 10 each for 30 total rows)
        total_count = sum(row[1] for row in metadata_results)
        assert total_count == 30

        # Check that conditions are as expected
        conditions = {row[0] for row in metadata_results}
        assert conditions == {"treatment_A", "treatment_B", "control"}

    def test_table_existence_checking_integration(self):
        """Test table existence checking with real DuckDB operations."""
        # Test non-existent table
        assert (
            self.cache_manager._check_metadata_exists_in_duckdb("nonexistent_table")
            is False
        )

        # Create a real table
        self.conn.execute("CREATE TABLE test_table (id INTEGER, name TEXT)")

        # Test existing table
        assert self.cache_manager._check_metadata_exists_in_duckdb("test_table") is True

        # Test with view — views must count as "existing" too.
        self.conn.execute("CREATE VIEW test_view AS SELECT * FROM test_table")
        assert self.cache_manager._check_metadata_exists_in_duckdb("test_view") is True


# Fixtures for common test data
@pytest.fixture
def sample_metadata_config():
    """Sample metadata configuration for testing."""
    return Mock(
        config_name="test_metadata",
        description="Test metadata configuration",
        data_files=[Mock(path="metadata.parquet")],
        applies_to=["data_config"],
    )


@pytest.fixture
def sample_data_config():
    """Sample data configuration for testing."""
    return Mock(
        config_name="test_data",
        metadata_fields=["condition", "replicate"],
        dataset_type=DatasetType.ANNOTATED_FEATURES,
    )


@pytest.fixture
def mock_cache_revision():
    """Mock cache revision for testing."""
    revision = Mock()
    revision.commit_hash = "abc123def456"
    revision.last_modified = datetime.now().timestamp()
    revision.size_on_disk = 1024 * 1024 * 100  # 100MB
    return revision

@pytest.fixture
def mock_cache_repo(mock_cache_revision):
    """Mock cache repository for testing."""
    repo = Mock()
    repo.repo_id = "test/repository"
    repo.revisions = [mock_cache_revision]
    repo.size_on_disk = 1024 * 1024 * 100  # 100MB
    repo.size_on_disk_str = "100.0MB"
    return repo


@pytest.fixture
def mock_cache_info(mock_cache_repo):
    """Mock cache info for testing."""
    cache_info = Mock()
    cache_info.cache_dir = "/tmp/cache"
    cache_info.repos = [mock_cache_repo]
    cache_info.size_on_disk = 1024 * 1024 * 100  # 100MB
    cache_info.size_on_disk_str = "100.0MB"

    # Mock delete_revisions method
    def mock_delete_revisions(*revision_hashes):
        strategy = Mock()
        strategy.expected_freed_size = (
            len(revision_hashes) * 1024 * 1024 * 50
        )  # 50MB per revision
        strategy.expected_freed_size_str = f"{len(revision_hashes) * 50}.0MB"
        strategy.delete_content = list(revision_hashes)
        strategy.execute = Mock()
        return strategy

    cache_info.delete_revisions = mock_delete_revisions
    return cache_info


class TestSourceSampleValidation:
    """Test validation of source_sample field format.

    A source_sample value is expected to be a 3-part composite identifier,
    e.g. 'BrentLab/harbison_2004;harbison_2004;CBF1_YPD' — presumably
    repo_id;config_name;sample_id (inferred from the test data; confirm
    against _validate_source_sample_fields).
    """

    def setup_method(self):
        """Set up test fixtures."""
        self.conn = duckdb.connect(":memory:")
        self.repo_id = "test/repo"

    def test_valid_source_sample_format(self, tmpdir):
        """Test that valid source_sample format passes validation."""
        # Create parquet file with valid composite identifiers
        parquet_file = tmpdir.join("valid_data.parquet")
        self.conn.execute(
            f"""
            COPY (
                SELECT
                    'BrentLab/harbison_2004;harbison_2004;CBF1_YPD'
                        as binding_sample_ref,
                    'gene_' || (row_number() OVER()) as target_locus_tag,
                    random() * 100 as binding_score
                FROM range(5)
            ) TO '{parquet_file}' (FORMAT PARQUET)
            """
        )

        # Create mock datacard with source_sample field
        mock_feature = Mock()
        mock_feature.name = "binding_sample_ref"
        mock_feature.role = "source_sample"

        mock_dataset_info = Mock()
        mock_dataset_info.features = [mock_feature]

        mock_config = Mock()
        mock_config.config_name = "test_config"
        mock_config.dataset_info = mock_dataset_info

        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            cache_manager = HfCacheManager(self.repo_id, self.conn)
            cache_manager.get_config = Mock(return_value=mock_config)  # type: ignore

            # Should not raise any error
            cache_manager._create_duckdb_table_from_files(
                [str(parquet_file)], "test_table", "test_config"
            )

    def test_invalid_source_sample_two_parts(self, tmpdir):
        """Test that source_sample with only 2 parts raises ValueError."""
        # Create parquet file with invalid format (only 2 parts)
        parquet_file = tmpdir.join("invalid_data.parquet")
        self.conn.execute(
            f"""
            COPY (
                SELECT
                    'BrentLab/harbison_2004;CBF1_YPD' as binding_sample_ref,
                    'gene_' || (row_number() OVER()) as target_locus_tag,
                    random() * 100 as binding_score
                FROM range(5)
            ) TO '{parquet_file}' (FORMAT PARQUET)
            """
        )

        # Create mock datacard with source_sample field
        mock_feature = Mock()
        mock_feature.name = "binding_sample_ref"
        mock_feature.role = "source_sample"

        mock_dataset_info = Mock()
        mock_dataset_info.features = [mock_feature]

        mock_config = Mock()
        mock_config.config_name = "test_config"
        mock_config.dataset_info = mock_dataset_info

        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            cache_manager = HfCacheManager(self.repo_id, self.conn)
            cache_manager.get_config = Mock(return_value=mock_config)  # type: ignore

            # Should raise ValueError with clear message
            with pytest.raises(ValueError) as exc_info:
                cache_manager._create_duckdb_table_from_files(
                    [str(parquet_file)], "test_table", "test_config"
                )

            error_msg = str(exc_info.value)
            assert "Invalid format in field 'binding_sample_ref'" in error_msg
            assert "role='source_sample'" in error_msg
            assert "3 semicolon-separated parts" in error_msg
            assert "BrentLab/harbison_2004;CBF1_YPD" in error_msg

    def test_invalid_source_sample_one_part(self, tmpdir):
        """Test that source_sample with only 1 part raises ValueError."""
        # Create parquet file with invalid format (only 1 part)
        parquet_file = tmpdir.join("invalid_data.parquet")
        self.conn.execute(
            f"""
            COPY (
                SELECT
                    'CBF1_YPD' as binding_sample_ref,
                    'gene_' || (row_number() OVER()) as target_locus_tag,
                    random() * 100 as binding_score
                FROM range(5)
            ) TO '{parquet_file}' (FORMAT PARQUET)
            """
        )

        # Create mock datacard with source_sample field
        mock_feature = Mock()
        mock_feature.name = "binding_sample_ref"
        mock_feature.role = "source_sample"

        mock_dataset_info = Mock()
        mock_dataset_info.features = [mock_feature]

        mock_config = Mock()
        mock_config.config_name = "test_config"
        mock_config.dataset_info = mock_dataset_info

        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            cache_manager = HfCacheManager(self.repo_id, self.conn)
            cache_manager.get_config = Mock(return_value=mock_config)  # type: ignore

            # Should raise ValueError
            with pytest.raises(ValueError) as exc_info:
                cache_manager._create_duckdb_table_from_files(
                    [str(parquet_file)], "test_table", "test_config"
                )

            error_msg = str(exc_info.value)
            assert "Invalid format in field 'binding_sample_ref'" in error_msg
            assert "CBF1_YPD" in error_msg

    def test_invalid_source_sample_four_parts(self, tmpdir):
        """Test that source_sample with 4 parts raises ValueError."""
        # Create parquet file with invalid format (4 parts)
        parquet_file = tmpdir.join("invalid_data.parquet")
        self.conn.execute(
            f"""
            COPY (
                SELECT
                    'a;b;c;d' as binding_sample_ref,
                    'gene_' || (row_number() OVER()) as target_locus_tag,
                    random() * 100 as binding_score
                FROM range(5)
            ) TO '{parquet_file}' (FORMAT PARQUET)
            """
        )

        # Create mock datacard with source_sample field
        mock_feature = Mock()
        mock_feature.name = "binding_sample_ref"
        mock_feature.role = "source_sample"

        mock_dataset_info = Mock()
        mock_dataset_info.features = [mock_feature]

        mock_config = Mock()
        mock_config.config_name = "test_config"
        mock_config.dataset_info = mock_dataset_info

        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            cache_manager = HfCacheManager(self.repo_id, self.conn)
            cache_manager.get_config = Mock(return_value=mock_config)  # type: ignore

            # Should raise ValueError
            with pytest.raises(ValueError) as exc_info:
                cache_manager._create_duckdb_table_from_files(
                    [str(parquet_file)], "test_table", "test_config"
                )

            error_msg = str(exc_info.value)
            assert "Invalid format in field 'binding_sample_ref'" in error_msg
            assert "a;b;c;d" in error_msg

    def test_no_source_sample_fields(self, tmpdir):
        """Test that validation is skipped when no source_sample fields exist."""
        # Create parquet file with normal data
        parquet_file = tmpdir.join("normal_data.parquet")
        self.conn.execute(
            f"""
            COPY (
                SELECT
                    'gene_' || (row_number() OVER()) as target_locus_tag,
                    random() * 100 as expression_value
                FROM range(5)
            ) TO '{parquet_file}' (FORMAT PARQUET)
            """
        )

        # Create mock datacard without source_sample fields
        mock_feature = Mock()
        mock_feature.name = "target_locus_tag"
        mock_feature.role = "target_identifier"

        mock_dataset_info = Mock()
        mock_dataset_info.features = [mock_feature]

        mock_config = Mock()
        mock_config.config_name = "test_config"
        mock_config.dataset_info = mock_dataset_info

        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            cache_manager = HfCacheManager(self.repo_id, self.conn)
            cache_manager.get_config = Mock(return_value=mock_config)  # type: ignore

            # Should not raise any error
            cache_manager._create_duckdb_table_from_files(
                [str(parquet_file)], "test_table", "test_config"
            )

    def test_multiple_source_sample_fields(self, tmpdir):
        """Test validation with multiple source_sample fields."""
        # Create parquet file with multiple composite
        # identifier fields
        parquet_file = tmpdir.join("multi_ref_data.parquet")
        self.conn.execute(
            f"""
            COPY (
                SELECT
                    'BrentLab/harbison_2004;harbison_2004;CBF1_YPD'
                        as binding_sample_ref,
                    'BrentLab/kemmeren_2014;kemmeren_2014;sample_42'
                        as expression_sample_ref,
                    'gene_' || (row_number() OVER()) as target_locus_tag
                FROM range(5)
            ) TO '{parquet_file}' (FORMAT PARQUET)
            """
        )

        # Create mock datacard with multiple source_sample fields
        mock_feature1 = Mock()
        mock_feature1.name = "binding_sample_ref"
        mock_feature1.role = "source_sample"

        mock_feature2 = Mock()
        mock_feature2.name = "expression_sample_ref"
        mock_feature2.role = "source_sample"

        mock_dataset_info = Mock()
        mock_dataset_info.features = [mock_feature1, mock_feature2]

        mock_config = Mock()
        mock_config.config_name = "test_config"
        mock_config.dataset_info = mock_dataset_info

        with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None):
            cache_manager = HfCacheManager(self.repo_id, self.conn)
            cache_manager.get_config = Mock(return_value=mock_config)  # type: ignore

            # Both fields are valid - should not raise
            cache_manager._create_duckdb_table_from_files(
                [str(parquet_file)], "test_table", "test_config"
            )
diff --git a/tfbpapi/tests/test_metadata_config_models.py b/tfbpapi/tests/test_metadata_config_models.py
new file mode 100644
index 0000000..9516e3d
--- /dev/null
+++ b/tfbpapi/tests/test_metadata_config_models.py
@@ -0,0 +1,716 @@
"""
Tests for metadata configuration Pydantic models.

Tests validation, error messages, and config loading for MetadataBuilder.

"""

import pytest
import yaml  # type: ignore
from pydantic import ValidationError

from tfbpapi.models import (
    MetadataConfig,
    PropertyMapping,
    RepositoryConfig,
)


class TestPropertyMapping:
    """Tests for PropertyMapping model.

    A PropertyMapping must carry at least one of `field` (column name),
    `path` (dotted metadata path), or `expression` (derived-value SQL-like
    expression); `expression` is mutually exclusive with the other two.
    """

    def test_valid_field_level_mapping(self):
        """Test valid field-level property mapping."""
        mapping = PropertyMapping(field="condition", path="media.carbon_source")
        assert mapping.field == "condition"
        assert mapping.path == "media.carbon_source"

    def test_valid_repo_level_mapping(self):
        """Test valid repo-level property mapping (no field)."""
        mapping = PropertyMapping(path="temperature_celsius")
        assert mapping.field is None
        assert mapping.path == "temperature_celsius"

    def test_invalid_empty_path(self):
        """Test that empty path is rejected."""
        with pytest.raises(ValidationError) as exc_info:
            PropertyMapping(path="")
        assert "cannot be empty" in str(exc_info.value)

    def test_invalid_whitespace_path(self):
        """Test that whitespace-only path is rejected."""
        with pytest.raises(ValidationError) as exc_info:
            PropertyMapping(path=" ")
        assert "cannot be empty" in str(exc_info.value)

    def test_invalid_empty_field(self):
        """Test that empty field string is rejected."""
        with pytest.raises(ValidationError) as exc_info:
            PropertyMapping(field="", path="media.carbon_source")
        assert "cannot be empty" in str(exc_info.value)

    def test_path_whitespace_stripped(self):
        """Test that path whitespace is stripped."""
        mapping = PropertyMapping(path=" media.carbon_source ")
        assert mapping.path == "media.carbon_source"

    def test_valid_field_only_mapping(self):
        """Test valid field-only mapping (column alias)."""
        mapping = PropertyMapping(field="condition")
        assert mapping.field == "condition"
        assert mapping.path is None

    def test_invalid_neither_field_nor_path(self):
        """Test that at least one of field, path, or expression is required."""
        with pytest.raises(ValidationError) as exc_info:
            PropertyMapping()
        assert (
            "At least one of 'field', 'path', or 'expression' must be specified"
            in str(exc_info.value)
        )

    def test_valid_expression_only(self):
        """Test valid expression-only mapping (derived field)."""
        mapping = PropertyMapping(expression="dto_fdr < 0.05")
        assert mapping.expression == "dto_fdr < 0.05"
        assert mapping.field is None
        assert mapping.path is None

    def test_invalid_expression_with_field(self):
        """Test that expression cannot be combined with field."""
        with pytest.raises(ValidationError) as exc_info:
            PropertyMapping(expression="dto_fdr < 0.05", field="sample_id")
        assert "expression cannot be used with field or path" in str(exc_info.value)

    def test_invalid_expression_with_path(self):
        """Test that expression cannot be combined with path."""
        with pytest.raises(ValidationError) as exc_info:
            PropertyMapping(expression="dto_fdr < 0.05", path="media.carbon_source")
        assert "expression cannot be used with field or path" in str(exc_info.value)


class TestDatasetVirtualDBConfig:
    """Tests for DatasetVirtualDBConfig model."""

    def test_valid_config_with_sample_id(self):
        """Test valid dataset config with sample_id."""
        from tfbpapi.models import DatasetVirtualDBConfig, PropertyMapping

        config = DatasetVirtualDBConfig(sample_id=PropertyMapping(field="sample_id"))
        assert config.sample_id is not None
        assert config.sample_id.field == "sample_id"

    def test_valid_config_with_field_mappings_and_links(self):
        """Test valid dataset config with field mappings and links for comparative
        datasets."""
        from tfbpapi.models import DatasetVirtualDBConfig

        config_dict = {
            "sample_id": {"field": "sample_id"},
            "dto_fdr": {"field": "dto_fdr"},
            # Field mapping for aliasing: dto_pvalue displays dto_empirical_pvalue
            "dto_pvalue": {"field": "dto_empirical_pvalue"},
            "links": {
                "binding_id": [
                    ["BrentLab/harbison_2004", "harbison_2004"],
                    ["BrentLab/callingcards", "annotated_features"],
                ]
            },
        }
        config = DatasetVirtualDBConfig.model_validate(config_dict)
        assert config.sample_id is not None
        # Check field mapping for aliasing via property_mappings
        assert "dto_pvalue" in config.property_mappings
        assert config.property_mappings["dto_pvalue"].field == "dto_empirical_pvalue"
        # Each link is a [repo_id, config_name] pair.
        assert "binding_id" in config.links
        assert len(config.links["binding_id"]) == 2
        assert config.links["binding_id"][0] == [
            "BrentLab/harbison_2004",
            "harbison_2004",
        ]

    def test_config_with_extra_property_mappings(self):
        """Test that extra fields are parsed as PropertyMappings."""
        from tfbpapi.models import DatasetVirtualDBConfig

        config_dict = {
            "sample_id": {"field": "sample_id"},
            "regulator_locus_tag": {"field": "regulator_locus_tag"},
            "dto_fdr": {"expression": "dto_fdr < 0.05"},
        }
        config = DatasetVirtualDBConfig.model_validate(config_dict)

        # Access extra fields via model_extra
        assert "regulator_locus_tag" in config.model_extra
        assert "dto_fdr" in config.model_extra

    def test_valid_db_name(self):
        """Test valid db_name is accepted."""
        from tfbpapi.models import DatasetVirtualDBConfig

        config = DatasetVirtualDBConfig.model_validate(
            {"db_name": "harbison", "sample_id": {"field": "sample_id"}}
        )
        assert config.db_name == "harbison"

    def test_db_name_none_by_default(self):
        """Test db_name defaults to None."""
        from tfbpapi.models import DatasetVirtualDBConfig

        config = DatasetVirtualDBConfig.model_validate(
            {"sample_id": {"field": "sample_id"}}
        )
        assert config.db_name is None

    def test_db_name_invalid_sql_identifier(self):
        """Test that invalid SQL identifiers are rejected."""
        from tfbpapi.models import DatasetVirtualDBConfig

        # Identifiers may not start with a digit.
        with pytest.raises(ValidationError) as exc_info:
            DatasetVirtualDBConfig.model_validate(
                {"db_name": "123bad", "sample_id": {"field": "sample_id"}}
            )
        assert "not a valid SQL identifier" in str(exc_info.value)
def test_db_name_with_spaces_rejected(self): + """Test that db_name with spaces is rejected.""" + from tfbpapi.models import DatasetVirtualDBConfig + + with pytest.raises(ValidationError) as exc_info: + DatasetVirtualDBConfig.model_validate( + {"db_name": "my table", "sample_id": {"field": "sample_id"}} + ) + assert "not a valid SQL identifier" in str(exc_info.value) + + def test_db_name_reserved_samples(self): + """Test that 'samples' is reserved and rejected.""" + from tfbpapi.models import DatasetVirtualDBConfig + + with pytest.raises(ValidationError) as exc_info: + DatasetVirtualDBConfig.model_validate( + {"db_name": "samples", "sample_id": {"field": "sample_id"}} + ) + assert "reserved" in str(exc_info.value) + + def test_db_name_reserved_case_insensitive(self): + """Test that reserved name check is case-insensitive.""" + from tfbpapi.models import DatasetVirtualDBConfig + + with pytest.raises(ValidationError) as exc_info: + DatasetVirtualDBConfig.model_validate( + {"db_name": "Samples", "sample_id": {"field": "sample_id"}} + ) + assert "reserved" in str(exc_info.value) + + def test_db_name_underscores_allowed(self): + """Test that underscores are allowed in db_name.""" + from tfbpapi.models import DatasetVirtualDBConfig + + config = DatasetVirtualDBConfig.model_validate( + {"db_name": "_my_table_2", "sample_id": {"field": "sample_id"}} + ) + assert config.db_name == "_my_table_2" + + +class TestRepositoryConfig: + """Tests for RepositoryConfig model.""" + + def test_valid_repo_config_with_datasets(self): + """Test valid repository config with dataset section.""" + config_data = { + "temperature_celsius": {"path": "temperature_celsius"}, + "dataset": { + "dataset1": { + "carbon_source": { + "field": "condition", + "path": "media.carbon_source", + } + } + }, + } + config = RepositoryConfig.model_validate(config_data) + assert config.dataset is not None + assert "dataset1" in config.dataset + + def test_valid_repo_config_no_datasets(self): + """Test valid 
repository config without dataset section.""" + config_data = {"temperature_celsius": {"path": "temperature_celsius"}} + config = RepositoryConfig.model_validate(config_data) + assert config.dataset is None + + def test_invalid_dataset_not_dict(self): + """Test that dataset section must be a dict.""" + config_data = {"dataset": "not a dict"} + with pytest.raises(ValidationError) as exc_info: + RepositoryConfig.model_validate(config_data) + assert "'dataset' key must contain a dict" in str(exc_info.value) + + def test_valid_field_only_property(self): + """Test that field-only properties are valid (column aliases).""" + config_data = { + "dataset": {"dataset1": {"carbon_source": {"field": "condition"}}} + } + config = RepositoryConfig.model_validate(config_data) + assert config.dataset is not None + assert "dataset1" in config.dataset + # Access extra field via model_extra + dataset_config = config.dataset["dataset1"] + assert "carbon_source" in dataset_config.model_extra + assert dataset_config.model_extra["carbon_source"].field == "condition" + assert dataset_config.model_extra["carbon_source"].path is None + + def test_valid_repo_wide_field_only_property(self): + """Test that repo-wide field-only properties are valid.""" + config_data = {"environmental_condition": {"field": "condition"}} + config = RepositoryConfig.model_validate(config_data) + assert "environmental_condition" in config.properties + assert config.properties["environmental_condition"].field == "condition" + assert config.properties["environmental_condition"].path is None + + +class TestMetadataConfig: + """Tests for MetadataConfig model.""" + + def test_valid_config_with_aliases(self, tmp_path): + """Test valid config with factor aliases.""" + config_data = { + "factor_aliases": { + "carbon_source": { + "glucose": ["D-glucose", "dextrose"], + "galactose": ["D-galactose", "Galactose"], + } + }, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": 
"media.carbon_source"}} + } + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + assert "carbon_source" in config.factor_aliases + assert "glucose" in config.factor_aliases["carbon_source"] + assert config.factor_aliases["carbon_source"]["glucose"] == [ + "D-glucose", + "dextrose", + ] + + def test_valid_config_without_aliases(self, tmp_path): + """Test that factor_aliases is optional.""" + config_data = { + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + } + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + assert config.factor_aliases == {} + + def test_valid_config_empty_aliases(self, tmp_path): + """Test that empty factor_aliases dict is allowed.""" + config_data = { + "factor_aliases": {}, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + assert config.factor_aliases == {} + + def test_invalid_alias_not_dict(self): + """Test that property aliases must be a dict.""" + config_data = { + "factor_aliases": { + "carbon_source": ["D-glucose"] # Should be dict, not list + }, + "repositories": { + "BrentLab/test": {"dataset": {"test": {"prop": {"path": "path"}}}} + }, + } + + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + # Pydantic catches this with type validation before our custom validator + assert "valid dictionary" in str(exc_info.value) or "must be a dict" in str( + exc_info.value + ) + + def test_invalid_alias_value_not_list(self): + """Test that alias values must be 
lists.""" + config_data = { + "factor_aliases": { + "carbon_source": {"glucose": "D-glucose"} # Should be list, not string + }, + "repositories": { + "BrentLab/test": {"dataset": {"test": {"prop": {"path": "path"}}}} + }, + } + + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + # Pydantic catches this with type validation before our custom validator + assert "valid list" in str(exc_info.value) or "must map to a list" in str( + exc_info.value + ) + + def test_invalid_alias_empty_list(self): + """Test that alias value lists cannot be empty.""" + config_data = { + "factor_aliases": {"carbon_source": {"glucose": []}}, + "repositories": { + "BrentLab/test": {"dataset": {"test": {"prop": {"path": "path"}}}} + }, + } + + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "cannot have empty value list" in str(exc_info.value) + + def test_aliases_allow_numeric_values(self): + """Test that aliases can map to numeric values.""" + config_data = { + "factor_aliases": { + "temperature_celsius": { + "thirty": [30, "30"], # Integer and string + "thirty_seven": [37, 37.0], # Integer and float + } + }, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"temperature": {"path": "temperature_celsius"}} + } + } + }, + } + + config = MetadataConfig.model_validate(config_data) + assert config.factor_aliases["temperature_celsius"]["thirty"] == [30, "30"] + assert config.factor_aliases["temperature_celsius"]["thirty_seven"] == [ + 37, + 37.0, + ] + + def test_invalid_no_repositories(self): + """Test that at least one repository is required.""" + config_data = {"factor_aliases": {"carbon_source": {"glucose": ["D-glucose"]}}} + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "at least one repository" in str(exc_info.value) + + def test_get_repository_config(self, tmp_path): + """Test get_repository_config method.""" + 
config_data = { + "factor_aliases": {"carbon_source": {"glucose": ["D-glucose"]}}, + "repositories": { + "BrentLab/harbison_2004": { + "dataset": { + "harbison_2004": { + "carbon_source": { + "field": "condition", + "path": "media.carbon_source", + } + } + } + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + repo_config = config.get_repository_config("BrentLab/harbison_2004") + assert repo_config is not None + assert isinstance(repo_config, RepositoryConfig) + assert repo_config.dataset is not None + assert "harbison_2004" in repo_config.dataset + + # Non-existent repo + assert config.get_repository_config("BrentLab/nonexistent") is None + + def test_get_property_mappings(self, tmp_path): + """Test get_property_mappings method.""" + config_data = { + "factor_aliases": { + "carbon_source": {"glucose": ["D-glucose"]}, + "temperature": {"thirty": [30]}, + }, + "repositories": { + "BrentLab/kemmeren_2014": { + "temperature": {"path": "temperature_celsius"}, # Repo-wide + "dataset": { + "kemmeren_2014": { + "carbon_source": {"path": "media.carbon_source"} + } + }, + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + mappings = config.get_property_mappings( + "BrentLab/kemmeren_2014", "kemmeren_2014" + ) + + # Should have both repo-wide and dataset-specific + assert "temperature" in mappings + assert "carbon_source" in mappings + # Mappings are PropertyMapping objects, not dicts + assert isinstance(mappings["temperature"], PropertyMapping) + assert mappings["temperature"].path == "temperature_celsius" + assert mappings["carbon_source"].path == "media.carbon_source" + + def test_dataset_specific_overrides_repo_wide(self, tmp_path): + """Test that dataset-specific mappings override repo-wide.""" + config_data = { + "repositories": { + 
+                "BrentLab/test": {
+                    "carbon_source": {"path": "repo.level.path"},  # Repo-wide
+                    "dataset": {
+                        "test_dataset": {
+                            "carbon_source": {"path": "dataset.level.path"}  # Override
+                        }
+                    },
+                }
+            },
+        }
+
+        config_path = tmp_path / "config.yaml"
+        with open(config_path, "w") as f:
+            yaml.dump(config_data, f)
+
+        config = MetadataConfig.from_yaml(config_path)
+        mappings = config.get_property_mappings("BrentLab/test", "test_dataset")
+
+        # Dataset-specific should win
+        assert mappings["carbon_source"].path == "dataset.level.path"
+
+    def test_file_not_found(self):
+        """Test that FileNotFoundError is raised for missing file."""
+        with pytest.raises(FileNotFoundError):
+            MetadataConfig.from_yaml("/nonexistent/path/config.yaml")
+
+    def test_invalid_yaml_structure(self, tmp_path):
+        """Test that non-dict YAML is rejected."""
+        config_path = tmp_path / "config.yaml"
+        with open(config_path, "w") as f:
+            f.write("- not\n- a\n- dict\n")  # real newlines: a 3-item YAML list, not a dict
+
+        with pytest.raises(ValueError) as exc_info:
+            MetadataConfig.from_yaml(config_path)
+        assert "Configuration file must contain a YAML dictionary" in str(
+            exc_info.value
+        )
+
+    def test_nested_alias_property_names(self, tmp_path):
+        """Test that alias property names can use dot notation."""
+        config_data = {
+            "factor_aliases": {
+                "carbon_source": {"glucose": ["D-glucose"]},
+                "carbon_source.concentration_percent": {"two_percent": [2]},
+                "carbon_source.specifications": {"no_aa": ["without_amino_acids"]},
+            },
+            "repositories": {
+                "BrentLab/test": {
+                    "dataset": {
+                        "test": {
+                            "carbon_source": {
+                                "field": "condition",
+                                "path": "media.carbon_source",
+                            }
+                        }
+                    }
+                }
+            },
+        }
+
+        config_path = tmp_path / "config.yaml"
+        with open(config_path, "w") as f:
+            yaml.dump(config_data, f)
+
+        config = MetadataConfig.from_yaml(config_path)
+
+        # All alias properties should be preserved
+        assert "carbon_source" in config.factor_aliases
+        assert "carbon_source.concentration_percent" in config.factor_aliases
+        assert "carbon_source.specifications" 
in config.factor_aliases + + # Values should be correct + assert config.factor_aliases["carbon_source"]["glucose"] == ["D-glucose"] + assert config.factor_aliases["carbon_source.concentration_percent"][ + "two_percent" + ] == [2] + assert config.factor_aliases["carbon_source.specifications"]["no_aa"] == [ + "without_amino_acids" + ] + + def test_unique_db_names_valid(self): + """Test that unique db_names across datasets pass validation.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "dataset_a": { + "db_name": "alpha", + "sample_id": {"field": "sample_id"}, + }, + "dataset_b": { + "db_name": "beta", + "sample_id": {"field": "sample_id"}, + }, + } + } + } + } + config = MetadataConfig.model_validate(config_data) + repo = config.get_repository_config("BrentLab/repo1") + assert repo.dataset["dataset_a"].db_name == "alpha" + assert repo.dataset["dataset_b"].db_name == "beta" + + def test_duplicate_db_names_rejected(self): + """Test that duplicate db_names are rejected.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "dataset_a": { + "db_name": "same_name", + "sample_id": {"field": "sample_id"}, + }, + "dataset_b": { + "db_name": "same_name", + "sample_id": {"field": "sample_id"}, + }, + } + } + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" in str(exc_info.value) + + def test_duplicate_db_name_case_insensitive(self): + """Test that db_name uniqueness is case-insensitive.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "dataset_a": { + "db_name": "Alpha", + "sample_id": {"field": "sample_id"}, + }, + "dataset_b": { + "db_name": "alpha", + "sample_id": {"field": "sample_id"}, + }, + } + } + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" in str(exc_info.value) + + def 
test_db_name_falls_back_to_config_name(self): + """Test that config_name is used when db_name is not set.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "harbison_2004": { + "sample_id": {"field": "sample_id"}, + } + } + }, + "BrentLab/repo2": { + "dataset": { + "kemmeren_2014": { + "sample_id": {"field": "sample_id"}, + } + } + }, + } + } + # Should pass -- different config_names used as fallback + config = MetadataConfig.model_validate(config_data) + assert config is not None + + def test_db_name_collides_with_config_name(self): + """Test that db_name colliding with another config_name is rejected.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "harbison": { + "sample_id": {"field": "sample_id"}, + } + } + }, + "BrentLab/repo2": { + "dataset": { + "kemmeren": { + "db_name": "harbison", + "sample_id": {"field": "sample_id"}, + } + } + }, + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" in str(exc_info.value) + + def test_duplicate_db_names_across_repos(self): + """Test that db_name uniqueness spans across repositories.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "ds1": { + "db_name": "shared", + "sample_id": {"field": "sample_id"}, + } + } + }, + "BrentLab/repo2": { + "dataset": { + "ds2": { + "db_name": "shared", + "sample_id": {"field": "sample_id"}, + } + } + }, + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" in str(exc_info.value) diff --git a/tfbpapi/tests/test_metric_arrays.py b/tfbpapi/tests/test_metric_arrays.py deleted file mode 100644 index 45a8203..0000000 --- a/tfbpapi/tests/test_metric_arrays.py +++ /dev/null @@ -1,194 +0,0 @@ -import logging - -import numpy as np -import pandas as pd -import pytest - -from tfbpapi.metric_arrays import metric_arrays - - -def 
test_metric_arrays_expected_result(caplog): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A", "B"], - "regulator_symbol": ["tf1", "tf2"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene2"], - "metric1": [1.0, 2.0], - } - ), - "B": pd.DataFrame( - { - "target_symbol": ["gene2", "gene1"], - "metric1": [3.0, 4.0], - } - ), - }, - } - metrics_dict = {"metric1": np.mean} - - # Run function - with caplog.at_level(logging.WARNING): - output_dict = metric_arrays(res_dict, metrics_dict) - - # Check expected result for metric1 - # order based on the index of output_dict['metrics1'] since the ordering of - # the rows is random due to the set operation - expected_df = pd.DataFrame( - {"tf1": [1.0, 2.0], "tf2": [4.0, 3.0]}, - index=pd.Index(["gene1", "gene2"], name="target_symbol"), - ).reindex(output_dict["metric1"].index) - - pd.testing.assert_frame_equal(output_dict["metric1"], expected_df) - - # Check no warning since there are no incomplete rows or columns - assert "incomplete" not in caplog.text - - -def test_metric_arrays_missing_data(caplog): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A", "B"], - "regulator_symbol": ["tf1", "tf2"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene2"], - "metric1": [1.0, 2.0], - } - ), - "B": pd.DataFrame( - { - "target_symbol": ["gene1", "gene3"], - "metric1": [5.0, 3.0], - } - ), - }, - } - metrics_dict = {"metric1": np.mean} - - # Run function with incomplete row dropping - with caplog.at_level(logging.WARNING): - output_dict1 = metric_arrays(res_dict, metrics_dict, drop_incomplete_rows=False) - - # Check result for metric1 with "gene2" dropped due to missing data in B - # sort based on output_dict['metric1'] index since - # the ordering of the rows is random - expected_df1 = pd.DataFrame( - {"tf1": [1.0, 2.0, np.nan], "tf2": [5.0, np.nan, 3.0]}, - index=pd.Index(["gene1", "gene2", "gene3"], name="target_symbol"), - 
).reindex(output_dict1["metric1"].index) - - pd.testing.assert_frame_equal(output_dict1["metric1"], expected_df1) - - # Run function with incomplete row dropping - with caplog.at_level(logging.WARNING): - output_dict2 = metric_arrays(res_dict, metrics_dict, drop_incomplete_rows=True) - - # Check result for metric1 with "gene2" dropped due to missing data in B - expected_df2 = pd.DataFrame( - {"tf1": [1.0], "tf2": [5.0]}, - index=pd.Index(["gene1"], name="target_symbol"), - ).reindex(output_dict2["metric1"].index) - - pd.testing.assert_frame_equal(output_dict2["metric1"], expected_df2) - - # Check warning for incomplete rows - assert "2 rows and 0 columns with incomplete records were dropped" in caplog.text - - -def test_metric_arrays_missing_keys(): - res_dict = { - "metadata": pd.DataFrame( - {"id": ["A"], "target_symbol": ["gene1"], "regulator_symbol": ["tf1"]} - ), - # Missing data for id "A" - "data": {}, - } - metrics_dict = {"metric1": np.mean} - - # Expect a KeyError for missing data keys - with pytest.raises(KeyError, match="Data dictionary must have the same keys"): - metric_arrays(res_dict, metrics_dict) - - -def test_metric_arrays_non_dataframe_value(): - res_dict = { - "metadata": pd.DataFrame( - {"id": ["A"], "target_symbol": ["gene1"], "regulator_symbol": ["tf1"]} - ), - "data": {"A": [1, 2, 3]}, # Invalid non-DataFrame entry - } - metrics_dict = {"metric1": np.mean} - - # Expect ValueError when data dictionary values are not DataFrames - with pytest.raises( - ValueError, match="All values in the data dictionary must be DataFrames" - ): - metric_arrays(res_dict, metrics_dict) - - -def test_metric_arrays_duplicate_rows_without_dedup_func(): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A"], - "target_symbol": ["gene1"], - "regulator_symbol": ["tf1"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene1"], - "metric1": [1.0, 2.0], - } - ), - }, - } - metrics_dict = {"metric1": None} # No deduplication function 
provided - - # Expect a ValueError due to duplicate rows without deduplication function - # - with pytest.raises( - ValueError, match="Duplicate entries found for metric 'metric1'" - ): - metric_arrays(res_dict, metrics_dict) # type: ignore - - -def test_metric_arrays_deduplication_function(): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A"], - "target_symbol": ["gene1"], - "regulator_symbol": ["tf1"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene1"], - "metric1": [1.0, 2.0], - } - ), - }, - } - metrics_dict = {"metric1": np.mean} # Deduplication function to average duplicates - - # Run function with deduplication - output_dict = metric_arrays(res_dict, metrics_dict) - - # Check that duplicates were averaged correctly - expected_df = pd.DataFrame( - {"tf1": [1.5]}, pd.Index(["gene1"], name="target_symbol") - ) - pd.testing.assert_frame_equal(output_dict["metric1"], expected_df) diff --git a/tfbpapi/tests/test_models.py b/tfbpapi/tests/test_models.py new file mode 100644 index 0000000..d78b538 --- /dev/null +++ b/tfbpapi/tests/test_models.py @@ -0,0 +1,670 @@ +""" +Tests for datainfo Pydantic models. + +These tests validate the minimal, flexible models that parse HuggingFace dataset cards. 
+ +""" + +import pytest +from pydantic import ValidationError + +from tfbpapi.models import ( + DataFileInfo, + DatasetCard, + DatasetConfig, + DatasetInfo, + DatasetType, + ExtractedMetadata, + FeatureInfo, + MetadataConfig, + MetadataRelationship, + PartitioningInfo, +) + + +class TestDatasetType: + """Tests for DatasetType enum.""" + + def test_dataset_type_values(self): + """Test that all expected dataset types are defined.""" + assert DatasetType.GENOMIC_FEATURES == "genomic_features" + assert DatasetType.ANNOTATED_FEATURES == "annotated_features" + assert DatasetType.GENOME_MAP == "genome_map" + assert DatasetType.METADATA == "metadata" + assert DatasetType.COMPARATIVE == "comparative" + + def test_dataset_type_from_string(self): + """Test creating DatasetType from string.""" + dt = DatasetType("genomic_features") + assert dt == DatasetType.GENOMIC_FEATURES + + def test_invalid_dataset_type(self): + """Test that invalid dataset type raises error.""" + with pytest.raises(ValueError): + DatasetType("invalid_type") + + +class TestFeatureInfo: + """Tests for FeatureInfo model.""" + + def test_minimal_feature_info(self): + """Test creating FeatureInfo with minimal fields.""" + feature = FeatureInfo( + name="gene_id", dtype="string", description="Gene identifier" + ) + assert feature.name == "gene_id" + assert feature.dtype == "string" + assert feature.description == "Gene identifier" + assert feature.role is None + assert feature.definitions is None + + def test_feature_info_with_role(self): + """Test FeatureInfo with role field.""" + feature = FeatureInfo( + name="condition", + dtype="string", + description="Experimental condition", + role="experimental_condition", + ) + assert feature.role == "experimental_condition" + + def test_feature_info_with_definitions(self): + """Test FeatureInfo with definitions for experimental_condition.""" + feature = FeatureInfo( + name="condition", + dtype={"class_label": {"names": ["control", "treated"]}}, + description="Treatment 
condition", + role="experimental_condition", + definitions={ + "control": {"temperature_celsius": 30}, + "treated": {"temperature_celsius": 37}, + }, + ) + assert feature.definitions is not None + assert "control" in feature.definitions + assert feature.definitions["control"]["temperature_celsius"] == 30 + + def test_feature_info_with_dict_dtype(self): + """Test FeatureInfo with class_label dtype.""" + feature = FeatureInfo( + name="category", + dtype={"class_label": {"names": ["A", "B", "C"]}}, + description="Categorical field", + ) + assert isinstance(feature.dtype, dict) + assert "class_label" in feature.dtype + + +class TestPartitioningInfo: + """Tests for PartitioningInfo model.""" + + def test_default_partitioning_info(self): + """Test PartitioningInfo with defaults.""" + partitioning = PartitioningInfo() + assert partitioning.enabled is False + assert partitioning.partition_by is None + assert partitioning.path_template is None + + def test_enabled_partitioning_info(self): + """Test PartitioningInfo with partitioning enabled.""" + partitioning = PartitioningInfo( + enabled=True, + partition_by=["accession"], + path_template="data/accession={accession}/*.parquet", + ) + assert partitioning.enabled is True + assert partitioning.partition_by == ["accession"] + assert partitioning.path_template == "data/accession={accession}/*.parquet" + + +class TestDataFileInfo: + """Tests for DataFileInfo model.""" + + def test_default_data_file_info(self): + """Test DataFileInfo with default split.""" + data_file = DataFileInfo(path="data.parquet") + assert data_file.split == "train" + assert data_file.path == "data.parquet" + + def test_custom_data_file_info(self): + """Test DataFileInfo with custom split.""" + data_file = DataFileInfo(split="test", path="test_data.parquet") + assert data_file.split == "test" + assert data_file.path == "test_data.parquet" + + +class TestDatasetInfo: + """Tests for DatasetInfo model.""" + + def test_minimal_dataset_info(self): + """Test 
DatasetInfo with minimal features.""" + dataset_info = DatasetInfo( + features=[ + FeatureInfo( + name="gene_id", dtype="string", description="Gene identifier" + ) + ] + ) + assert len(dataset_info.features) == 1 + assert dataset_info.partitioning is None + + def test_dataset_info_with_partitioning(self): + """Test DatasetInfo with partitioning.""" + dataset_info = DatasetInfo( + features=[ + FeatureInfo(name="chr", dtype="string", description="Chromosome"), + FeatureInfo(name="pos", dtype="int32", description="Position"), + ], + partitioning=PartitioningInfo(enabled=True, partition_by=["chr"]), + ) + assert len(dataset_info.features) == 2 + assert dataset_info.partitioning.enabled is True # type: ignore + + +class TestDatasetConfig: + """Tests for DatasetConfig model.""" + + def test_minimal_dataset_config(self): + """Test DatasetConfig with minimal required fields.""" + config = DatasetConfig( + config_name="test_data", + description="Test dataset", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[FeatureInfo(name="id", dtype="string", description="ID")] + ), + ) + assert config.config_name == "test_data" + assert config.dataset_type == DatasetType.ANNOTATED_FEATURES + assert config.default is False + assert config.applies_to is None + assert config.metadata_fields is None + + def test_dataset_config_with_applies_to(self): + """Test DatasetConfig with applies_to for metadata.""" + config = DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + applies_to=["data_config_1", "data_config_2"], + data_files=[DataFileInfo(path="metadata.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo( + name="sample_id", dtype="string", description="Sample ID" + ) + ] + ), + ) + assert config.applies_to == ["data_config_1", "data_config_2"] + + def test_dataset_config_applies_to_validation_error(self): + """Test that applies_to 
raises error for non-metadata configs.""" + with pytest.raises(ValidationError): + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + applies_to=["other_config"], + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[FeatureInfo(name="id", dtype="string", description="ID")] + ), + ) + + def test_dataset_config_with_metadata_fields(self): + """Test DatasetConfig with metadata_fields.""" + config = DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + metadata_fields=["regulator_symbol", "condition"], + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo( + name="regulator_symbol", dtype="string", description="TF symbol" + ), + FeatureInfo( + name="condition", dtype="string", description="Condition" + ), + ] + ), + ) + assert config.metadata_fields == ["regulator_symbol", "condition"] + + def test_dataset_config_empty_metadata_fields_error(self): + """Test that empty metadata_fields raises error.""" + with pytest.raises(ValidationError): + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + metadata_fields=[], + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[FeatureInfo(name="id", dtype="string", description="ID")] + ), + ) + + def test_dataset_config_accepts_extra_fields(self): + """Test that DatasetConfig accepts extra fields like experimental_conditions.""" + config_data = { + "config_name": "data", + "description": "Data", + "dataset_type": "annotated_features", + "experimental_conditions": { + "temperature_celsius": 30, + "media": {"name": "YPD"}, + }, + "data_files": [{"path": "data.parquet"}], + "dataset_info": { + "features": [{"name": "id", "dtype": "string", "description": "ID"}] + }, + } + config = DatasetConfig(**config_data) + assert hasattr(config, "model_extra") 
+ assert "experimental_conditions" in config.model_extra + + +class TestDatasetCard: + """Tests for DatasetCard model.""" + + def test_minimal_dataset_card(self): + """Test DatasetCard with minimal structure.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ) + ] + ) + assert len(card.configs) == 1 + + def test_dataset_card_accepts_extra_fields(self): + """Test that DatasetCard accepts extra top-level fields.""" + card_data = { + "license": "mit", + "pretty_name": "Test Dataset", + "tags": ["biology", "genomics"], + "experimental_conditions": {"strain_background": "BY4741"}, + "configs": [ + { + "config_name": "data", + "description": "Data", + "dataset_type": "annotated_features", + "data_files": [{"path": "data.parquet"}], + "dataset_info": { + "features": [ + {"name": "id", "dtype": "string", "description": "ID"} + ] + }, + } + ], + } + card = DatasetCard(**card_data) + assert hasattr(card, "model_extra") + assert "license" in card.model_extra + assert "experimental_conditions" in card.model_extra + + def test_empty_configs_error(self): + """Test that empty configs raises error.""" + with pytest.raises(ValidationError): + DatasetCard(configs=[]) + + def test_duplicate_config_names_error(self): + """Test that duplicate config names raises error.""" + with pytest.raises(ValidationError): + DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data", + description="Data 2", + dataset_type=DatasetType.ANNOTATED_FEATURES, + 
data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + + def test_multiple_default_configs_error(self): + """Test that multiple default configs raises error.""" + with pytest.raises(ValidationError): + DatasetCard( + configs=[ + DatasetConfig( + config_name="data1", + description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + default=True, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data2", + description="Data 2", + dataset_type=DatasetType.ANNOTATED_FEATURES, + default=True, + data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + + def test_get_config_by_name(self): + """Test get_config_by_name method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data1", + description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data2", + description="Data 2", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + config = card.get_config_by_name("data1") + assert config is not None + assert config.config_name == "data1" + assert card.get_config_by_name("nonexistent") is None + + def test_get_configs_by_type(self): + """Test get_configs_by_type method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + 
data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="metadata.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + data_configs = card.get_configs_by_type(DatasetType.ANNOTATED_FEATURES) + assert len(data_configs) == 1 + assert data_configs[0].config_name == "data" + + def test_get_default_config(self): + """Test get_default_config method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data1", + description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data2", + description="Data 2", + dataset_type=DatasetType.ANNOTATED_FEATURES, + default=True, + data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + default = card.default_config + assert default is not None + assert default.config_name == "data2" + + def test_get_data_configs(self): + """Test get_data_configs method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="metadata.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", 
dtype="string", description="ID") + ] + ), + ), + ] + ) + data_configs = card.get_data_configs() + assert len(data_configs) == 1 + assert data_configs[0].dataset_type != DatasetType.METADATA + + def test_get_metadata_configs(self): + """Test get_metadata_configs method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="metadata.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + metadata_configs = card.get_metadata_configs() + assert len(metadata_configs) == 1 + assert metadata_configs[0].dataset_type == DatasetType.METADATA + + +class TestExtractedMetadata: + """Tests for ExtractedMetadata model.""" + + def test_extracted_metadata_creation(self): + """Test creating ExtractedMetadata.""" + metadata = ExtractedMetadata( + config_name="test_config", + field_name="regulator_symbol", + values={"CBF1", "GAL4", "GCN4"}, + extraction_method="distinct", + ) + assert metadata.config_name == "test_config" + assert metadata.field_name == "regulator_symbol" + assert len(metadata.values) == 3 + assert "CBF1" in metadata.values + + +class TestMetadataRelationship: + """Tests for MetadataRelationship model.""" + + def test_metadata_relationship_creation(self): + """Test creating MetadataRelationship.""" + relationship = MetadataRelationship( + data_config="binding_data", + metadata_config="experiment_metadata", + relationship_type="explicit", + ) + assert relationship.data_config == "binding_data" + assert relationship.metadata_config == "experiment_metadata" + assert relationship.relationship_type == 
"explicit" + + +# ------------------------------------------------------------------ +# Minimal valid YAML snippets reused across MetadataConfig tests +# ------------------------------------------------------------------ + +_MINIMAL_CONFIG = { + "repositories": { + "BrentLab/harbison": { + "dataset": { + "harbison_2004": { + "sample_id": {"field": "sample_id"}, + } + } + } + } +} + + +class TestMetadataConfig: + """Tests for MetadataConfig Pydantic model validation.""" + + def test_valid_minimal_config(self): + """Minimal config with one repo and one dataset parses successfully.""" + config = MetadataConfig.model_validate(_MINIMAL_CONFIG) + assert "BrentLab/harbison" in config.repositories + + def test_missing_repositories_key_raises(self): + """Config missing 'repositories' raises ValueError.""" + with pytest.raises((ValidationError, ValueError)): + MetadataConfig.model_validate({}) + + def test_empty_repositories_raises(self): + """Config with empty 'repositories' dict raises ValueError.""" + with pytest.raises((ValidationError, ValueError)): + MetadataConfig.model_validate({"repositories": {}}) + + def test_repository_with_no_dataset_raises(self): + """Repository with no 'dataset' key raises ValueError.""" + with pytest.raises((ValidationError, ValueError)): + MetadataConfig.model_validate({"repositories": {"BrentLab/harbison": {}}}) + + def test_optional_sections_absent_succeeds(self): + """Parsing succeeds when optional sections are absent.""" + config = MetadataConfig.model_validate(_MINIMAL_CONFIG) + assert config.factor_aliases == {} + assert config.missing_value_labels == {} + + def test_optional_sections_present(self): + """Optional sections are parsed correctly when present.""" + data = { + "repositories": { + "BrentLab/harbison": { + "dataset": { + "harbison_2004": { + "sample_id": {"field": "sample_id"}, + } + } + } + }, + "factor_aliases": {"carbon_source": {"glucose": ["glu", "dextrose"]}}, + "missing_value_labels": {"carbon_source": "unspecified"}, 
+ } + config = MetadataConfig.model_validate(data) + assert "carbon_source" in config.factor_aliases + assert config.missing_value_labels != {} + + def test_duplicate_db_name_raises(self): + """Duplicate db_name across datasets raises ValueError.""" + with pytest.raises((ValidationError, ValueError)): + MetadataConfig.model_validate( + { + "repositories": { + "BrentLab/harbison": { + "dataset": { + "harbison_2004": { + "db_name": "shared_name", + "sample_id": {"field": "sample_id"}, + } + } + }, + "BrentLab/kemmeren": { + "dataset": { + "kemmeren_2014": { + "db_name": "shared_name", + "sample_id": {"field": "sample_id"}, + } + } + }, + } + } + ) diff --git a/tfbpapi/tests/test_rank_transforms.py b/tfbpapi/tests/test_rank_transforms.py deleted file mode 100644 index 31dbeaa..0000000 --- a/tfbpapi/tests/test_rank_transforms.py +++ /dev/null @@ -1,80 +0,0 @@ -import numpy as np -from scipy.stats import rankdata - -from tfbpapi.rank_transforms import ( - shifted_negative_log_ranks, - transform, -) - - -def test_shifted_negative_log_ranks_basic(): - ranks = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) - expected_log_ranks = -1 * np.log10(ranks) + np.log10(np.max(ranks)) - - actual_log_ranks = shifted_negative_log_ranks(ranks) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_shifted_negative_log_ranks_with_ties(): - ranks = np.array([1.0, 2.5, 2.5, 3.0, 4.0]) - expected_log_ranks = -1 * np.log10(ranks) + np.log10(np.max(ranks)) - - actual_log_ranks = shifted_negative_log_ranks(ranks) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_negative_log_transform_basic(): - pvalues = np.array([0.01, 0.05, 0.01, 0.02, 0.05]) - enrichment = np.array([5.0, 3.0, 6.0, 4.0, 4.5]) - - # Expected ranks based on pvalue (primary) with enrichment (secondary) tie-breaking - expected_ranks = np.array([2.0, 5.0, 1.0, 3.0, 4.0]) - expected_log_ranks = -1 * np.log10(expected_ranks) + np.log10( - np.max(expected_ranks) - ) 
- - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_all_ties_in_primary_column(): - pvalues = np.array([0.01, 0.01, 0.01, 0.01]) - enrichment = np.array([10.0, 20.0, 15.0, 5.0]) - - # With all pvalues tied, the ranking should depend solely - # on enrichment (higher is better) - expected_secondary_ranks = rankdata(-enrichment, method="average") - expected_log_ranks = -1 * np.log10(expected_secondary_ranks) + np.log10( - np.max(expected_secondary_ranks) - ) - - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_no_ties_in_primary_column(): - pvalues = np.array([0.01, 0.02, 0.03, 0.04]) - enrichment = np.array([5.0, 10.0, 15.0, 20.0]) - - # With no ties in pvalue, the secondary column should have no effect - expected_ranks = rankdata(pvalues, method="average") - expected_log_ranks = -1 * np.log10(expected_ranks) + np.log10( - np.max(expected_ranks) - ) - - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_tied_in_both_pvalue_and_enrichment(): - pvalues = np.array([0.01, 0.05, 0.01, 0.02, 0.05]) - enrichment = np.array([5.0, 3.0, 5.0, 4.0, 3.0]) - - # With ties in both primary and secondary columns - expected_ranks = np.array([1.5, 4.5, 1.5, 3.0, 4.5]) - expected_log_ranks = -1 * np.log10(expected_ranks) + np.log10( - np.max(expected_ranks) - ) - - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) diff --git a/tfbpapi/tests/test_real_datacards.py b/tfbpapi/tests/test_real_datacards.py new file mode 100644 index 0000000..b244a45 --- /dev/null +++ b/tfbpapi/tests/test_real_datacards.py @@ -0,0 +1,704 @@ +""" +Test real datacards from the HuggingFace collection. 
+ +This test suite validates that all real datacards from the BrentLab collection parse +correctly with the updated models.py and specification. + +""" + +import pytest +import yaml # type: ignore + +from tfbpapi.models import DatasetCard + +# Real datacard YAML strings from the collection +BARKAI_COMPENDIUM = """ +license: mit +language: +- en +tags: +- transcription-factor +- binding +- chec-seq +- genomics +- biology +pretty_name: Barkai ChEC-seq Compendium +size_categories: + - 100M 0 + + # Verify config has required fields + config = card.configs[0] + assert config.config_name is not None + assert config.dataset_type is not None + assert config.dataset_info is not None + assert config.dataset_info.features is not None + assert len(config.dataset_info.features) > 0 + + +def test_harbison_2004_condition_definitions(): + """Test that harbison_2004 field-level definitions parse correctly.""" + data = yaml.safe_load(HARBISON_2004) + card = DatasetCard(**data) + + # Find the config + config = card.configs[0] + assert config.config_name == "harbison_2004" + + # Find condition feature + condition_feature = next( + f for f in config.dataset_info.features if f.name == "condition" + ) + + # Should have definitions + assert condition_feature.definitions is not None + assert "YPD" in condition_feature.definitions + assert "Acid" in condition_feature.definitions + assert "BUT14" in condition_feature.definitions + + # YPD definition should have environmental conditions + ypd_def = condition_feature.definitions["YPD"] + assert "environmental_conditions" in ypd_def + + # Acid definition should have target_pH in chemical_treatment + acid_def = condition_feature.definitions["Acid"] + assert "environmental_conditions" in acid_def + assert "chemical_treatment" in acid_def["environmental_conditions"] + assert "target_pH" in acid_def["environmental_conditions"]["chemical_treatment"] + + # BUT14 should have media additives + but14_def = condition_feature.definitions["BUT14"] + assert 
"environmental_conditions" in but14_def + assert "media" in but14_def["environmental_conditions"] + assert "additives" in but14_def["environmental_conditions"]["media"] + + +def test_hughes_2006_induction(): + """Test that hughes_2006 induction field parses correctly.""" + data = yaml.safe_load(HUGHES_2006) + card = DatasetCard(**data) + + # Check experimental conditions (stored as dict in model_extra) + assert card.configs[0].model_extra is not None + assert "experimental_conditions" in card.configs[0].model_extra + exp_conds = card.configs[0].model_extra["experimental_conditions"] + + # Check induction field + assert "induction" in exp_conds + induction = exp_conds["induction"] + assert "inducer" in induction + assert induction["inducer"]["compound"] == "D-galactose" + assert induction["duration_hours"] == 3 + + +def test_kemmeren_2014_growth_phase(): + """Test that kemmeren_2014 growth phase with od600_tolerance parses correctly.""" + data = yaml.safe_load(KEMMEREN_2014) + card = DatasetCard(**data) + + # Check growth phase (stored as dict in model_extra) + assert card.model_extra is not None + assert "experimental_conditions" in card.model_extra + exp_conds = card.model_extra["experimental_conditions"] + + assert "growth_phase_at_harvest" in exp_conds + growth_phase = exp_conds["growth_phase_at_harvest"] + assert growth_phase["phase"] == "early_mid_log" + assert growth_phase["od600"] == 0.6 + assert growth_phase["od600_tolerance"] == 0.1 + + +def test_hu_2007_strain_background_in_definitions(): + """Test that strain_background in field definitions parses correctly.""" + data = yaml.safe_load(HU_2007) + card = DatasetCard(**data) + + # Find heat_shock feature + config = card.configs[0] + heat_shock_feature = next( + f for f in config.dataset_info.features if f.name == "heat_shock" + ) + + # Check definitions + assert heat_shock_feature.definitions is not None + assert "true" in heat_shock_feature.definitions + + # Check strain_background in definition + true_def 
= heat_shock_feature.definitions["true"] + assert "strain_background" in true_def + + +def test_field_role_validation(): + """Test that role field accepts any string value.""" + # This should parse successfully with any role string + data = yaml.safe_load(CALLINGCARDS) + card = DatasetCard(**data) + + # Find a feature with a role + config = card.configs[0] + regulator_feature = next( + f for f in config.dataset_info.features if f.name == "regulator_locus_tag" + ) + + # Verify role is a string (not an enum) + assert regulator_feature.role == "regulator_identifier" + assert isinstance(regulator_feature.role, str) + + +def test_concentration_fields(): + """Test that various concentration fields parse correctly.""" + data = yaml.safe_load(KEMMEREN_2014) + card = DatasetCard(**data) + + # Check media compounds (stored as dict in model_extra) + assert card.model_extra is not None + assert "experimental_conditions" in card.model_extra + exp_conds = card.model_extra["experimental_conditions"] + assert "media" in exp_conds + media = exp_conds["media"] + + # Check carbon source + assert "carbon_source" in media + carbon_sources = media["carbon_source"] + assert len(carbon_sources) > 0 + carbon = carbon_sources[0] + assert carbon["concentration_percent"] is not None + + # Check nitrogen source with specifications + assert "nitrogen_source" in media + nitrogen_sources = media["nitrogen_source"] + assert len(nitrogen_sources) > 0 + nitrogen = nitrogen_sources[0] + assert nitrogen["specifications"] is not None + assert "without_amino_acids" in nitrogen["specifications"] + + +def test_extra_fields_do_not_raise_errors(): + """Test that extra fields are accepted (with warnings) but don't raise errors.""" + # All real datacards should parse without ValidationError + # even if they have extra fields + datacards = [ + BARKAI_COMPENDIUM, + CALLINGCARDS, + HARBISON_2004, + HU_2007, + HUGHES_2006, + KEMMEREN_2014, + MAHENDRAWADA_2025, + ROSSI_2021, + ] + + for datacard_yaml in datacards: 
+ data = yaml.safe_load(datacard_yaml) + # This should not raise ValidationError + card = DatasetCard(**data) + assert card is not None + + +def test_empty_nitrogen_source_list(): + """Test that empty nitrogen_source lists are accepted.""" + data = yaml.safe_load(BARKAI_COMPENDIUM) + card = DatasetCard(**data) + + # Check that nitrogen_source is an empty list (stored as dict in model_extra) + assert card.model_extra is not None + assert "experimental_conditions" in card.model_extra + exp_conds = card.model_extra["experimental_conditions"] + assert "media" in exp_conds + media = exp_conds["media"] + assert media["nitrogen_source"] == [] + + +def test_media_additives(): + """Test that media additives parse correctly.""" + data = yaml.safe_load(HARBISON_2004) + card = DatasetCard(**data) + + # Find BUT14 condition definition + config = card.configs[0] + condition_feature = next( + f for f in config.dataset_info.features if f.name == "condition" + ) + but14_def = condition_feature.definitions["BUT14"] # type: ignore + + # Check additives + env_conds_dict = but14_def["environmental_conditions"] + media = env_conds_dict["media"] + assert "additives" in media + additives = media["additives"] + assert len(additives) > 0 + assert additives[0]["compound"] == "butanol" + assert additives[0]["concentration_percent"] == 1 + + +def test_strain_background_formats(): + """Test that strain_background accepts both string and dict formats.""" + # String format + data1 = yaml.safe_load(BARKAI_COMPENDIUM) + card1 = DatasetCard(**data1) + assert card1.model_extra is not None + assert "experimental_conditions" in card1.model_extra + exp_conds1 = card1.model_extra["experimental_conditions"] + assert exp_conds1["strain_background"] == "BY4741" + + # String format in rossi + data2 = yaml.safe_load(ROSSI_2021) + card2 = DatasetCard(**data2) + assert card2.model_extra is not None + assert "experimental_conditions" in card2.model_extra + exp_conds2 = 
card2.model_extra["experimental_conditions"] + assert exp_conds2["strain_background"] == "W303" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py new file mode 100644 index 0000000..3d32e52 --- /dev/null +++ b/tfbpapi/tests/test_virtual_db.py @@ -0,0 +1,1726 @@ +""" +Tests for the SQL-first VirtualDB interface. + +Uses local Parquet fixtures and monkeypatches ``_resolve_parquet_files`` +and ``_cached_datacard`` so no network access is needed. + +""" + +from pathlib import Path +from unittest.mock import MagicMock + +import duckdb +import pandas as pd +import pytest +import yaml # type: ignore + +from tfbpapi.datacard import DatasetSchema +from tfbpapi.models import DatasetType, FeatureInfo, MetadataConfig +from tfbpapi.virtual_db import VirtualDB + +# ------------------------------------------------------------------ +# Fixtures +# ------------------------------------------------------------------ + + +def _write_parquet(path: Path, df: pd.DataFrame) -> str: + """Write a DataFrame to a Parquet file using DuckDB.""" + conn = duckdb.connect(":memory:") + conn.execute(f"COPY (SELECT * FROM df) TO '{path}' (FORMAT PARQUET)") + conn.close() + return str(path) + + +@pytest.fixture() +def parquet_dir(tmp_path): + """ + Create Parquet files for two primary datasets and one comparative. + + harbison has a ``condition`` column (like the real dataset) rather + than ``carbon_source`` / ``temperature_celsius`` as raw columns. + Those are derived from DataCard field definitions via config + property mappings. + + kemmeren has no ``condition`` column; carbon_source and + temperature_celsius come from config-level (path-only) mappings + that resolve to constants from the DataCard. + + Returns dict mapping (repo_id, config_name) -> [parquet_path]. 
+ + """ + # harbison: 4 samples; samples 1-3 have 2 target measurements each, + # sample 4 has 2 targets but condition "Unknown" has no definition + # for carbon_source (tests missing_value_labels fallback) + harbison_df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2, 3, 3, 4, 4], + "regulator_locus_tag": [ + "YBR049C", + "YBR049C", + "YDR463W", + "YDR463W", + "YBR049C", + "YBR049C", + "YDR463W", + "YDR463W", + ], + "regulator_symbol": [ + "REB1", + "REB1", + "STP1", + "STP1", + "REB1", + "REB1", + "STP1", + "STP1", + ], + "condition": [ + "YPD", + "YPD", + "Galactose", + "Galactose", + "Acid", + "Acid", + "Unknown", + "Unknown", + ], + "target_locus_tag": [ + "YAL001C", + "YAL002W", + "YAL001C", + "YAL003W", + "YAL002W", + "YAL003W", + "YAL001C", + "YAL002W", + ], + "effect": [1.5, 0.8, 2.1, 0.3, 1.2, 0.9, 0.6, 1.0], + "pvalue": [0.01, 0.4, 0.001, 0.9, 0.05, 0.3, 0.2, 0.7], + } + ) + # kemmeren: 2 samples, each with 2 targets = 4 rows + # No condition column; carbon_source comes from path-only mapping + kemmeren_df = pd.DataFrame( + { + "sample_id": [10, 10, 11, 11], + "regulator_locus_tag": [ + "YBR049C", + "YBR049C", + "YDR463W", + "YDR463W", + ], + "regulator_symbol": [ + "REB1", + "REB1", + "STP1", + "STP1", + ], + "target_locus_tag": [ + "YAL001C", + "YAL002W", + "YAL001C", + "YAL003W", + ], + "effect": [1.1, 0.7, 1.8, 0.5], + "pvalue": [0.02, 0.5, 0.003, 0.7], + } + ) + dto_df = pd.DataFrame( + { + "binding_id": [ + "BrentLab/harbison;harbison_2004;1", + "BrentLab/harbison;harbison_2004;2", + "BrentLab/harbison;harbison_2004;3", + ], + "perturbation_id": [ + "BrentLab/kemmeren;kemmeren_2014;10", + "BrentLab/kemmeren;kemmeren_2014;11", + "BrentLab/kemmeren;kemmeren_2014;10", + ], + "dto_empirical_pvalue": [0.001, 0.05, 0.8], + "dto_fdr": [0.01, 0.1, 0.9], + } + ) + + files = {} + h_path = tmp_path / "harbison.parquet" + files[("BrentLab/harbison", "harbison_2004")] = [ + _write_parquet(h_path, harbison_df) + ] + + k_path = tmp_path / "kemmeren.parquet" + 
files[("BrentLab/kemmeren", "kemmeren_2014")] = [ + _write_parquet(k_path, kemmeren_df) + ] + + d_path = tmp_path / "dto.parquet" + files[("BrentLab/comp", "dto")] = [_write_parquet(d_path, dto_df)] + + return files + + +@pytest.fixture() +def config_path(tmp_path): + """Create a YAML config file for the test datasets.""" + config = { + "factor_aliases": { + "carbon_source": { + "glucose": ["D-glucose", "dextrose"], + "galactose": ["D-galactose"], + } + }, + "missing_value_labels": {"carbon_source": "unspecified"}, + "repositories": { + "BrentLab/harbison": { + "dataset": { + "harbison_2004": { + "db_name": "harbison", + "sample_id": {"field": "sample_id"}, + "regulator_locus_tag": { + "field": "regulator_locus_tag", + }, + "regulator_symbol": { + "field": "regulator_symbol", + }, + # field+path: derive from condition definitions + "carbon_source": { + "field": "condition", + "path": "media.carbon_source.compound", + }, + "temperature_celsius": { + "field": "condition", + "path": "temperature_celsius", + "dtype": "numeric", + }, + # field-only rename + "environmental_condition": { + "field": "condition", + }, + } + } + }, + "BrentLab/kemmeren": { + # repo-level path-only mappings (constants) + # Paths include experimental_conditions prefix + # to match real datacard model_extra structure + "carbon_source": { + "path": ("experimental_conditions" ".media.carbon_source.compound"), + }, + "temperature_celsius": { + "path": ("experimental_conditions" ".temperature_celsius"), + "dtype": "numeric", + }, + "dataset": { + "kemmeren_2014": { + "db_name": "kemmeren", + "sample_id": {"field": "sample_id"}, + "regulator_locus_tag": { + "field": "regulator_locus_tag", + }, + "regulator_symbol": { + "field": "regulator_symbol", + }, + } + }, + }, + "BrentLab/comp": { + "dataset": { + "dto": { + "dto_pvalue": {"field": "dto_empirical_pvalue"}, + "dto_fdr": {"field": "dto_fdr"}, + "links": { + "binding_id": [ + [ + "BrentLab/harbison", + "harbison_2004", + ], + ], + 
"perturbation_id": [ + [ + "BrentLab/kemmeren", + "kemmeren_2014", + ], + ], + }, + } + } + }, + }, + } + p = tmp_path / "config.yaml" + with open(p, "w") as f: + yaml.dump(config, f) + return p + + +# metadata_fields per dataset (mirrors what the DataCard would return) +METADATA_FIELDS = { + "harbison_2004": [ + "regulator_locus_tag", + "regulator_symbol", + "condition", + ], + "kemmeren_2014": [ + "regulator_locus_tag", + "regulator_symbol", + ], +} + +# Field definitions from DataCard (condition field for harbison) +HARBISON_CONDITION_DEFS = { + "YPD": { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, + }, + "Galactose": { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-galactose"}, + ], + }, + }, + "Acid": { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, + }, +} + +# Experimental conditions from DataCard (kemmeren -- config-level) +KEMMEREN_EXP_CONDITIONS = { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, +} + + +def _make_mock_datacard(repo_id): + """Create a mock DataCard for testing.""" + card = MagicMock() + + if repo_id == "BrentLab/harbison": + config_mock = MagicMock() + config_mock.metadata_fields = METADATA_FIELDS["harbison_2004"] + card.get_config.return_value = config_mock + card.get_field_definitions.return_value = HARBISON_CONDITION_DEFS + card.get_experimental_conditions.return_value = {} + card.get_metadata_fields.return_value = METADATA_FIELDS["harbison_2004"] + card.get_metadata_config_name.return_value = None + # Harbison: embedded metadata, condition is data col used for + # derived properties; metadata_cols are the three metadata fields + harbison_meta_cols = set(METADATA_FIELDS["harbison_2004"]) + harbison_data_cols = { + "sample_id", + "condition", + "target_locus_tag", + "effect", + "pvalue", + } - harbison_meta_cols + 
card.get_data_col_names.return_value = { + "sample_id", + "regulator_locus_tag", + "regulator_symbol", + "condition", + "target_locus_tag", + "effect", + "pvalue", + } + card.get_dataset_schema.return_value = DatasetSchema( + data_columns=harbison_data_cols + | { + "sample_id", + "condition", + "target_locus_tag", + "effect", + "pvalue", + }, + metadata_columns=harbison_meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + elif repo_id == "BrentLab/kemmeren": + config_mock = MagicMock() + config_mock.metadata_fields = METADATA_FIELDS["kemmeren_2014"] + config_mock.model_extra = {} + card.get_config.return_value = config_mock + card.get_field_definitions.return_value = {} + dataset_card_mock = MagicMock() + dataset_card_mock.model_extra = { + "experimental_conditions": KEMMEREN_EXP_CONDITIONS, + } + card.dataset_card = dataset_card_mock + card.get_metadata_fields.return_value = METADATA_FIELDS["kemmeren_2014"] + card.get_metadata_config_name.return_value = None + kemmeren_meta_cols = set(METADATA_FIELDS["kemmeren_2014"]) + card.get_data_col_names.return_value = { + "sample_id", + "regulator_locus_tag", + "regulator_symbol", + "target_locus_tag", + "effect", + "pvalue", + } + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={ + "sample_id", + "target_locus_tag", + "effect", + "pvalue", + }, + metadata_columns=kemmeren_meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + else: + config_mock = MagicMock() + config_mock.metadata_fields = None + config_mock.dataset_type = DatasetType.COMPARATIVE + card.get_config.return_value = config_mock + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + card.get_metadata_fields.return_value = None + card.get_metadata_config_name.return_value = None + card.get_data_col_names.return_value = set() + 
card.get_dataset_schema.return_value = None + + return card + + +@pytest.fixture() +def vdb(config_path, parquet_dir, monkeypatch): + """Return a VirtualDB with _resolve_parquet_files and _cached_datacard monkeypatched + for local testing.""" + import tfbpapi.virtual_db as vdb_module + + def _fake_resolve(self, repo_id, config_name): + return parquet_dir.get((repo_id, config_name), []) + + monkeypatch.setattr(VirtualDB, "_resolve_parquet_files", _fake_resolve) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: _make_mock_datacard(repo_id), + ) + return VirtualDB(config_path) + + +# ------------------------------------------------------------------ +# Tests: Initialisation and config +# ------------------------------------------------------------------ + + +class TestVirtualDBConfig: + """Tests for VirtualDB configuration loading.""" + + def test_init_loads_config(self, config_path, monkeypatch): + """Test that config loads without error.""" + monkeypatch.setattr(VirtualDB, "_load_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_validate_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_update_cache", lambda self: None) + monkeypatch.setattr(VirtualDB, "_register_all_views", lambda self: None) + v = VirtualDB(config_path) + assert v.config is not None + assert v.token is None + + def test_init_with_token(self, config_path, monkeypatch): + """Test token is stored.""" + monkeypatch.setattr(VirtualDB, "_load_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_validate_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_update_cache", lambda self: None) + monkeypatch.setattr(VirtualDB, "_register_all_views", lambda self: None) + v = VirtualDB(config_path, token="tok123") + assert v.token == "tok123" + + def test_init_missing_file(self): + """Test FileNotFoundError for missing config.""" + with pytest.raises(FileNotFoundError): + VirtualDB("/nonexistent/path.yaml") + + def 
test_repr(self, vdb): + """Test repr shows repo, dataset, and view counts.""" + r = repr(vdb) + assert "VirtualDB" in r + assert "views)" in r + + def test_db_name_map(self, config_path, monkeypatch): + """Test that _db_name_map resolves db_name correctly.""" + monkeypatch.setattr(VirtualDB, "_load_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_validate_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_update_cache", lambda self: None) + monkeypatch.setattr(VirtualDB, "_register_all_views", lambda self: None) + v = VirtualDB(config_path) + assert "harbison" in v._db_name_map + assert "kemmeren" in v._db_name_map + assert "dto" in v._db_name_map + assert v._db_name_map["harbison"] == ( + "BrentLab/harbison", + "harbison_2004", + ) + + +# ------------------------------------------------------------------ +# Tests: Tags +# ------------------------------------------------------------------ + + +class TestTags: + """Tests for get_tags() hierarchical merging.""" + + def _make_config(self, yaml_str: str) -> MetadataConfig: + import yaml as _yaml + + return MetadataConfig.model_validate(_yaml.safe_load(yaml_str)) + + def test_repo_level_tags_only(self): + """Repo-level tags propagate when dataset has none.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + organism: yeast + dataset: + harbison_2004: + sample_id: + field: sample_id + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags == {"assay": "binding", "organism": "yeast"} + + def test_dataset_level_tags_only(self): + """Dataset-level tags are returned when repo has none.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + sample_id: + field: sample_id + tags: + assay: chip-chip + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags == {"assay": "chip-chip"} + + def test_dataset_overrides_repo_tags(self): + 
"""Dataset-level tags override repo-level for the same key.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + organism: yeast + dataset: + harbison_2004: + sample_id: + field: sample_id + tags: + assay: chip-chip + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags["assay"] == "chip-chip" + assert tags["organism"] == "yeast" + + def test_no_tags(self): + """Returns empty dict when neither level has tags.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + sample_id: + field: sample_id + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags == {} + + def test_unknown_repo_returns_empty(self): + """Unknown repo_id returns empty dict.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + sample_id: + field: sample_id + """ + ) + assert config.get_tags("BrentLab/nonexistent", "harbison_2004") == {} + + def test_yaml_round_trip(self): + """Tags parsed from YAML produce correct merged result.""" + config = self._make_config( + """ + repositories: + BrentLab/repo_a: + tags: + type: primary + organism: yeast + dataset: + dataset_a: + sample_id: + field: sample_id + tags: + type: binding + version: "2024" + BrentLab/repo_b: + tags: + type: perturbation + dataset: + dataset_b: + sample_id: + field: sample_id + """ + ) + tags_a = config.get_tags("BrentLab/repo_a", "dataset_a") + assert tags_a == {"type": "binding", "organism": "yeast", "version": "2024"} + + tags_b = config.get_tags("BrentLab/repo_b", "dataset_b") + assert tags_b == {"type": "perturbation"} + + def _make_vdb(self, yaml_str: str, tmp_path, monkeypatch) -> VirtualDB: + monkeypatch.setattr(VirtualDB, "_load_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_validate_datacards", lambda self: None) + monkeypatch.setattr(VirtualDB, "_update_cache", lambda self: None) + 
monkeypatch.setattr(VirtualDB, "_register_all_views", lambda self: None) + p = tmp_path / "config.yaml" + p.write_text(yaml_str) + return VirtualDB(str(p)) + + def test_vdb_get_tags_returns_merged(self, tmp_path, monkeypatch): + """VirtualDB.get_tags() returns merged repo+dataset tags by db_name.""" + vdb = self._make_vdb( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + organism: yeast + dataset: + harbison_2004: + db_name: harbison + sample_id: + field: sample_id + tags: + assay: chip-chip + """, + tmp_path, + monkeypatch, + ) + tags = vdb.get_tags("harbison") + assert tags == {"assay": "chip-chip", "organism": "yeast"} + + def test_vdb_get_tags_unknown_name_returns_empty(self, tmp_path, monkeypatch): + """VirtualDB.get_tags() returns empty dict for unknown db_name.""" + vdb = self._make_vdb( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + db_name: harbison + sample_id: + field: sample_id + """, + tmp_path, + monkeypatch, + ) + assert vdb.get_tags("nonexistent") == {} + + def test_vdb_get_tags_no_views_needed(self, tmp_path, monkeypatch): + """VirtualDB.get_tags() returns correct tags from config.""" + vdb = self._make_vdb( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + dataset: + harbison_2004: + db_name: harbison + sample_id: + field: sample_id + """, + tmp_path, + monkeypatch, + ) + tags = vdb.get_tags("harbison") + assert tags == {"assay": "binding"} + + def test_vdb_get_datasets(self, tmp_path, monkeypatch): + """VirtualDB.get_datasets() returns sorted db_names from config.""" + vdb = self._make_vdb( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + db_name: harbison + sample_id: + field: sample_id + BrentLab/kemmeren: + dataset: + kemmeren_2014: + db_name: kemmeren + sample_id: + field: sample_id + """, + tmp_path, + monkeypatch, + ) + assert vdb.get_datasets() == ["harbison", "kemmeren"] + + +# ------------------------------------------------------------------ +# 
Tests: View registration +# ------------------------------------------------------------------ + + +class TestViewRegistration: + """Tests for view creation.""" + + def test_raw_views_created(self, vdb): + """Test that raw per-dataset views exist.""" + views = vdb.tables() + assert "harbison" in views + assert "kemmeren" in views + # Comparative datasets only get _expanded, not a bare view + assert "dto" not in views + assert "dto_expanded" in views + + def test_raw_view_has_all_rows(self, vdb): + """Test raw view returns measurement-level data.""" + df = vdb.query("SELECT COUNT(*) AS n FROM harbison") + # 4 samples x 2 targets each = 8 rows + assert df["n"].iloc[0] == 8 + + def test_raw_view_has_measurement_columns(self, vdb): + """Test raw view includes measurement columns.""" + fields = vdb.get_fields("harbison") + assert "target_locus_tag" in fields + assert "effect" in fields + assert "pvalue" in fields + + def test_raw_view_has_condition_column(self, vdb): + """Test harbison raw view has condition and derived columns.""" + fields = vdb.get_fields("harbison") + assert "condition" in fields + # Derived columns are available via join to _meta + assert "carbon_source" in fields + assert "temperature_celsius" in fields + + def test_meta_views_created(self, vdb): + """Test that _meta views exist for primary datasets.""" + views = vdb.tables() + assert "harbison_meta" in views + assert "kemmeren_meta" in views + # Comparative datasets should NOT have _meta views + assert "dto_meta" not in views + + def test_meta_view_one_row_per_sample(self, vdb): + """Test _meta view has one row per sample_id.""" + df = vdb.query("SELECT COUNT(*) AS n FROM harbison_meta") + # 4 distinct samples + assert df["n"].iloc[0] == 4 + + def test_meta_view_excludes_measurement_columns(self, vdb): + """Test _meta view has only metadata columns.""" + fields = vdb.get_fields("harbison_meta") + assert "sample_id" in fields + assert "regulator_locus_tag" in fields + # Measurement columns should 
NOT be in _meta + assert "target_locus_tag" not in fields + assert "effect" not in fields + assert "pvalue" not in fields + + def test_meta_view_has_derived_carbon_source(self, vdb): + """Test harbison_meta has carbon_source from field+path.""" + fields = vdb.get_fields("harbison_meta") + assert "carbon_source" in fields + df = vdb.query( + "SELECT sample_id, carbon_source " "FROM harbison_meta ORDER BY sample_id" + ) + values = dict(zip(df["sample_id"], df["carbon_source"])) + # YPD -> D-glucose -> glucose (aliased) + assert values[1] == "glucose" + # Galactose -> D-galactose -> galactose (aliased) + assert values[2] == "galactose" + # Acid -> D-glucose -> glucose (aliased) + assert values[3] == "glucose" + # Unknown -> no definition -> missing_value_labels fallback + assert values[4] == "unspecified" + + def test_meta_view_has_derived_temperature(self, vdb): + """Test harbison_meta has temperature_celsius from field+path.""" + fields = vdb.get_fields("harbison_meta") + assert "temperature_celsius" in fields + df = vdb.query( + "SELECT DISTINCT temperature_celsius " + "FROM harbison_meta " + "WHERE temperature_celsius IS NOT NULL" + ) + # Conditions with definitions have temperature_celsius=30; + # "Unknown" has no definition so gets NULL + assert len(df) == 1 + assert df["temperature_celsius"].iloc[0] == 30.0 + + def test_meta_view_has_field_rename(self, vdb): + """Test harbison_meta has environmental_condition alias.""" + fields = vdb.get_fields("harbison_meta") + assert "environmental_condition" in fields + df = vdb.query( + "SELECT DISTINCT environmental_condition " + "FROM harbison_meta ORDER BY environmental_condition" + ) + values = sorted(df["environmental_condition"].tolist()) + assert values == ["Acid", "Galactose", "Unknown", "YPD"] + + def test_meta_view_path_only_constant(self, vdb): + """Test kemmeren_meta has carbon_source from path-only.""" + fields = vdb.get_fields("kemmeren_meta") + assert "carbon_source" in fields + df = vdb.query("SELECT 
DISTINCT carbon_source FROM kemmeren_meta") + # Constant resolved from experimental_conditions + # D-glucose -> glucose (aliased) + assert len(df) == 1 + assert df["carbon_source"].iloc[0] == "glucose" + + def test_meta_view_path_only_numeric(self, vdb): + """Test kemmeren_meta has temperature_celsius as numeric.""" + df = vdb.query("SELECT DISTINCT temperature_celsius " "FROM kemmeren_meta") + assert len(df) == 1 + assert df["temperature_celsius"].iloc[0] == 30.0 + + def test_comparative_expanded_view(self, vdb): + """Test that dto_expanded view is created.""" + views = vdb.tables() + assert "dto_expanded" in views + + def test_expanded_view_has_parsed_columns(self, vdb): + """Test that expanded view has _source and _id columns.""" + df = vdb.query("SELECT * FROM dto_expanded LIMIT 1") + assert "binding_id_source" in df.columns + assert "binding_id_id" in df.columns + assert "perturbation_id_source" in df.columns + assert "perturbation_id_id" in df.columns + + def test_expanded_view_source_aliased(self, vdb): + """Test that _source columns use db_name aliases.""" + df = vdb.query("SELECT DISTINCT binding_id_source " "FROM dto_expanded") + assert "harbison" in df["binding_id_source"].tolist() + + def test_expanded_view_perturbation_source_aliased(self, vdb): + """Test that perturbation_id_source uses db_name alias.""" + df = vdb.query("SELECT DISTINCT perturbation_id_source " "FROM dto_expanded") + assert "kemmeren" in df["perturbation_id_source"].tolist() + + def test_expanded_view_id_values(self, vdb): + """Test that _id columns contain the sample_id component.""" + df = vdb.query( + "SELECT DISTINCT binding_id_id " "FROM dto_expanded ORDER BY binding_id_id" + ) + assert set(df["binding_id_id"]) == {"1", "2", "3"} + + +# ------------------------------------------------------------------ +# Tests: Factor aliases in _meta views +# ------------------------------------------------------------------ + + +class TestFactorAliases: + """Tests that factor aliases are 
applied in _meta views.""" + + def test_alias_applied_in_meta(self, vdb): + """Test that aliases are applied at _meta level too.""" + df = vdb.query( + "SELECT DISTINCT carbon_source " "FROM harbison_meta ORDER BY carbon_source" + ) + values = df["carbon_source"].tolist() + assert "glucose" in values + assert "galactose" in values + assert "D-glucose" not in values + + +# ------------------------------------------------------------------ +# Tests: query() public API +# ------------------------------------------------------------------ + + +class TestQuery: + """Tests for the query() method.""" + + def test_raw_sql(self, vdb): + """Test basic SQL execution.""" + df = vdb.query("SELECT * FROM harbison WHERE sample_id = 1") + # 2 rows: sample 1 has two target measurements + assert len(df) == 2 + assert all(df["sample_id"] == 1) + + def test_parameterized_query(self, vdb): + """Test query with named parameters.""" + df = vdb.query( + "SELECT * FROM harbison WHERE sample_id = $sid", + sid=1, + ) + # 2 rows: sample 1 has two target measurements + assert len(df) == 2 + assert all(df["sample_id"] == 1) + + def test_query_returns_dataframe(self, vdb): + """Test that query always returns a DataFrame.""" + df = vdb.query("SELECT 1 AS x") + assert isinstance(df, pd.DataFrame) + + +# ------------------------------------------------------------------ +# Tests: prepare() and prepared queries +# ------------------------------------------------------------------ + + +class TestPrepare: + """Tests for the prepare() method.""" + + def test_prepare_and_query(self, vdb): + """Test registering and using a prepared query.""" + vdb.prepare( + "by_condition", + "SELECT * FROM harbison " "WHERE condition = $cond", + ) + df = vdb.query("by_condition", cond="YPD") + # 2 rows: sample 1 with YPD has 2 targets + assert len(df) == 2 + assert all(df["condition"] == "YPD") + + def test_prepare_name_collision_with_view(self, vdb): + """Test that prepare rejects names colliding with views.""" + with 
pytest.raises(ValueError, match="collides with"): + vdb.prepare("harbison", "SELECT 1") + + def test_prepare_overwrite(self, vdb): + """Test that re-preparing the same name overwrites.""" + vdb.prepare("q1", "SELECT 1 AS x") + vdb.prepare("q1", "SELECT 2 AS x") + df = vdb.query("q1") + assert df["x"].iloc[0] == 2 + + +# ------------------------------------------------------------------ +# Tests: tables() and describe() +# ------------------------------------------------------------------ + + +class TestDiscovery: + """Tests for tables(), describe(), get_fields().""" + + def test_tables_sorted(self, vdb): + """Test that tables() returns sorted view names.""" + views = vdb.tables() + assert views == sorted(views) + + def test_describe_single(self, vdb): + """Test describe for a single view.""" + df = vdb.describe("harbison") + assert "column_name" in df.columns + assert "column_type" in df.columns + assert "table" in df.columns + assert all(df["table"] == "harbison") + col_names = df["column_name"].tolist() + assert "sample_id" in col_names + assert "condition" in col_names + + def test_describe_all(self, vdb): + """Test describe for all views.""" + df = vdb.describe() + tables = df["table"].unique().tolist() + assert "harbison" in tables + assert "kemmeren" in tables + + def test_get_fields_single(self, vdb): + """Test get_fields for a specific view.""" + fields = vdb.get_fields("harbison") + assert "sample_id" in fields + assert "condition" in fields + assert fields == sorted(fields) + + def test_get_fields_all(self, vdb): + """Test get_fields across all views.""" + fields = vdb.get_fields() + assert "sample_id" in fields + # comparative fields + assert "dto_empirical_pvalue" in fields + + def test_get_common_fields(self, vdb): + """Test common fields across primary _meta views.""" + common = vdb.get_common_fields() + # Both harbison_meta and kemmeren_meta share these + assert "sample_id" in common + assert "carbon_source" in common + assert "temperature_celsius" 
in common + assert "regulator_locus_tag" in common + + +# ------------------------------------------------------------------ +# Tests: get_nested_value helper +# ------------------------------------------------------------------ + + +class TestGetNestedValue: + """Tests for the get_nested_value module-level helper.""" + + def test_simple_path(self): + from tfbpapi.virtual_db import get_nested_value + + data = {"media": {"name": "YPD"}} + assert get_nested_value(data, "media.name") == "YPD" + + def test_list_extraction(self): + from tfbpapi.virtual_db import get_nested_value + + data = { + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, + } + result = get_nested_value(data, "media.carbon_source.compound") + assert result == ["D-glucose"] + + def test_missing_key(self): + from tfbpapi.virtual_db import get_nested_value + + assert get_nested_value({"a": 1}, "b") is None + + def test_deep_missing(self): + from tfbpapi.virtual_db import get_nested_value + + assert get_nested_value({"a": {"b": 1}}, "a.c") is None + + def test_non_dict_input(self): + from tfbpapi.virtual_db import get_nested_value + + assert get_nested_value("not a dict", "a.b") is None # type: ignore + + +# ------------------------------------------------------------------ +# Tests: edge cases +# ------------------------------------------------------------------ + + +class TestEdgeCases: + """Edge case and error handling tests.""" + + def test_no_parquet_files(self, tmp_path, monkeypatch): + """Test graceful handling when no parquet files are found.""" + import tfbpapi.virtual_db as vdb_module + + config = { + "repositories": { + "BrentLab/empty": { + "dataset": { + "empty_data": { + "sample_id": {"field": "sample_id"}, + } + } + } + } + } + p = tmp_path / "config.yaml" + with open(p, "w") as f: + yaml.dump(config, f) + + def _fake_resolve(self, repo_id, config_name): + return [] + + monkeypatch.setattr(VirtualDB, "_resolve_parquet_files", _fake_resolve) + monkeypatch.setattr( + 
vdb_module, + "_cached_datacard", + lambda repo_id, token=None: _make_mock_datacard(repo_id), + ) + + # Should not raise; just have no views + v = VirtualDB(p) + views = v.tables() + assert "empty_data" not in views + + def test_links_with_non_comparative_dataset_type_raises( + self, tmp_path, monkeypatch + ): + """Dataset with 'links' but datacard dataset_type != comparative raises + ValueError.""" + import tfbpapi.virtual_db as vdb_module + + config = { + "repositories": { + "BrentLab/harbison": { + "dataset": { + "harbison_2004": { + "sample_id": {"field": "sample_id"}, + "links": { + "sample_id": [["BrentLab/primary", "primary_data"]] + }, + } + } + } + } + } + p = tmp_path / "config.yaml" + with open(p, "w") as f: + yaml.dump(config, f) + + non_comparative_card = _make_mock_datacard("BrentLab/harbison") + cfg_mock = MagicMock() + cfg_mock.dataset_type = DatasetType.ANNOTATED_FEATURES + non_comparative_card.get_config.return_value = cfg_mock + + monkeypatch.setattr(VirtualDB, "_resolve_parquet_files", lambda *a: []) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: non_comparative_card, + ) + + with pytest.raises(ValueError, match="comparative"): + VirtualDB(p) + + +# ------------------------------------------------------------------ +# Tests: dynamic sample_id column +# ------------------------------------------------------------------ + + +class TestDynamicSampleId: + """Tests that the sample identifier column is resolved from config.""" + + def test_non_default_sample_id(self, tmp_path, monkeypatch): + """Views work when sample_id maps to a non-default column.""" + import tfbpapi.virtual_db as vdb_module + + # Config uses experiment_id as the sample identifier + config = { + "repositories": { + "TestOrg/custom_id": { + "dataset": { + "custom_data": { + "db_name": "custom", + "sample_id": { + "field": "experiment_id", + }, + "regulator": { + "field": "regulator", + }, + } + } + } + } + } + config_path = tmp_path / 
"config.yaml" + with open(config_path, "w") as f: + yaml.dump(config, f) + + # Parquet uses experiment_id (not sample_id) + df = pd.DataFrame( + { + "experiment_id": [100, 100, 200, 200], + "regulator": ["TF1", "TF1", "TF2", "TF2"], + "target": ["G1", "G2", "G1", "G2"], + "score": [1.5, 0.8, 2.1, 0.3], + } + ) + parquet_path = tmp_path / "custom.parquet" + files = { + ("TestOrg/custom_id", "custom_data"): [_write_parquet(parquet_path, df)], + } + + # Mock datacard + mock_card = MagicMock() + mock_card.get_metadata_fields.return_value = [ + "regulator", + ] + mock_card.get_field_definitions.return_value = {} + mock_card.get_experimental_conditions.return_value = {} + mock_card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"experiment_id", "target", "score"}, + metadata_columns={"regulator"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: mock_card, + ) + + v = VirtualDB(config_path) + + # Meta view should rename experiment_id -> sample_id + meta_df = v.query("SELECT * FROM custom_meta") + assert "sample_id" in meta_df.columns + assert "experiment_id" not in meta_df.columns + assert list(meta_df["sample_id"]) == [100, 200] or set( + meta_df["sample_id"] + ) == {100, 200} + assert len(meta_df) == 2 # 2 distinct samples + + # Enriched raw view should also expose sample_id + raw_df = v.query("SELECT * FROM custom") + assert "sample_id" in raw_df.columns + assert "experiment_id" not in raw_df.columns + assert len(raw_df) == 4 # all rows + + def test_non_default_sample_id_with_collision(self, tmp_path, monkeypatch): + """When parquet has both gm_id (sample) and sample_id (other col), gm_id is + renamed to sample_id and sample_id is preserved as sample_id_orig.""" + import 
tfbpapi.virtual_db as vdb_module + + config = { + "repositories": { + "TestOrg/collision": { + "dataset": { + "collision_data": { + "db_name": "collision", + "sample_id": {"field": "gm_id"}, + "regulator": {"field": "regulator"}, + } + } + } + } + } + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config, f) + + # Parquet has gm_id (the real sample id) AND a literal sample_id col + df = pd.DataFrame( + { + "gm_id": [1, 1, 2, 2], + "sample_id": [101, 101, 102, 102], # some other field + "regulator": ["TF1", "TF1", "TF2", "TF2"], + "target": ["G1", "G2", "G1", "G2"], + "score": [1.0, 2.0, 3.0, 4.0], + } + ) + parquet_path = tmp_path / "collision.parquet" + files = { + ("TestOrg/collision", "collision_data"): [_write_parquet(parquet_path, df)], + } + + mock_card = MagicMock() + mock_card.get_metadata_fields.return_value = ["regulator"] + mock_card.get_field_definitions.return_value = {} + mock_card.get_experimental_conditions.return_value = {} + mock_card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"gm_id", "sample_id", "target", "score"}, + metadata_columns={"regulator"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: mock_card, + ) + + v = VirtualDB(config_path) + + # Meta view: gm_id -> sample_id, original sample_id -> sample_id_orig + meta_df = v.query("SELECT * FROM collision_meta") + assert "sample_id" in meta_df.columns + assert "sample_id_orig" in meta_df.columns + assert "gm_id" not in meta_df.columns + assert set(meta_df["sample_id"]) == {1, 2} + assert set(meta_df["sample_id_orig"]) == {101, 102} + + # Raw view same behavior + raw_df = v.query("SELECT * FROM collision") + assert "sample_id" in raw_df.columns + assert 
"sample_id_orig" in raw_df.columns + assert "gm_id" not in raw_df.columns + assert len(raw_df) == 4 + + def test_get_sample_id_field_dataset_level(self): + """Dataset-level sample_id takes precedence.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "dataset": { + "ds": { + "sample_id": { + "field": "my_id", + }, + } + } + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "my_id" + + def test_get_sample_id_field_repo_level(self): + """Repo-level sample_id used when dataset has none.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "sample_id": {"field": "repo_sid"}, + "dataset": {"ds": {}}, + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "repo_sid" + + def test_get_sample_id_field_default(self): + """Falls back to 'sample_id' when not configured.""" + config = MetadataConfig.model_validate( + {"repositories": {"Org/repo": {"dataset": {"ds": {}}}}} + ) + assert config.get_sample_id_field("Org/repo", "ds") == "sample_id" + + def test_get_sample_id_field_dataset_overrides_repo(self): + """Dataset-level overrides repo-level.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "sample_id": {"field": "repo_id_col"}, + "dataset": { + "ds": { + "sample_id": { + "field": "ds_id_col", + }, + } + }, + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "ds_id_col" + + +class TestExternalMetadata: + """Tests for datasets with external metadata parquet files.""" + + def test_external_metadata_join(self, tmp_path, monkeypatch): + """Meta view JOINs data and metadata parquet when metadata is in a separate + config.""" + import tfbpapi.virtual_db as vdb_module + + # Data parquet: measurements with sample_id but no + # metadata columns like db_id or batch + data_df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2], + "target_locus_tag": [ + "YAL001C", + "YAL002W", + "YAL001C", + "YAL002W", + ], + "effect": 
[1.5, 0.8, 2.1, 0.3], + } + ) + # Metadata parquet: sample-level metadata + meta_df = pd.DataFrame( + { + "sample_id": [1, 2], + "db_id": [101, 102], + "regulator_locus_tag": ["YBR049C", "YDR463W"], + "background_hops": [500, 600], + } + ) + + data_path = _write_parquet(tmp_path / "data.parquet", data_df) + meta_path = _write_parquet(tmp_path / "meta.parquet", meta_df) + + parquet_files = { + ("TestOrg/repo", "chip_data"): [data_path], + ("TestOrg/repo", "sample_metadata"): [meta_path], + } + + config = { + "repositories": { + "TestOrg/repo": { + "sample_id": {"field": "sample_id"}, + "dataset": { + "chip_data": { + "db_name": "chip", + "regulator_locus_tag": { + "field": "regulator_locus_tag", + }, + } + }, + } + } + } + config_file = tmp_path / "config.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + # Mock DataCard: external metadata via applies_to + card = MagicMock() + config_mock = MagicMock() + config_mock.metadata_fields = None # no embedded + card.get_config.return_value = config_mock + card.get_metadata_fields.return_value = [ + "sample_id", + "db_id", + "regulator_locus_tag", + "background_hops", + ] + card.get_metadata_config_name.return_value = "sample_metadata" + # Data parquet columns (from chip_data features) + card.get_data_col_names.return_value = { + "sample_id", + "target_locus_tag", + "effect", + } + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + # External metadata schema: data cols in data parquet, + # metadata cols in metadata parquet, joined on sample_id + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"sample_id", "target_locus_tag", "effect"}, + metadata_columns={ + "sample_id", + "db_id", + "regulator_locus_tag", + "background_hops", + }, + join_columns={"sample_id"}, + metadata_source="external", + external_metadata_config="sample_metadata", + is_partitioned=False, + ) + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda 
self, repo_id, cfg: parquet_files.get((repo_id, cfg), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + + v = VirtualDB(config_file) + tables = v.tables() + assert "chip" in tables + assert "chip_meta" in tables + + # Meta view should have columns from both parquets + meta_result = v.query("SELECT * FROM chip_meta ORDER BY sample_id") + meta_cols = set(meta_result.columns) + assert "sample_id" in meta_cols + assert "db_id" in meta_cols + assert "regulator_locus_tag" in meta_cols + assert "background_hops" in meta_cols + + # Verify data is correct (joined properly) + assert len(meta_result) == 2 + row1 = meta_result[meta_result["sample_id"] == 1].iloc[0] + assert row1["db_id"] == 101 + assert row1["regulator_locus_tag"] == "YBR049C" + + # Enriched raw view should also work + raw_result = v.query("SELECT * FROM chip ORDER BY sample_id") + assert "db_id" in raw_result.columns + assert len(raw_result) == 4 # 4 data rows + + +# ------------------------------------------------------------------ +# Tests: dtype='factor' (DuckDB ENUM) +# ------------------------------------------------------------------ + + +class TestFactorDtype: + """Tests for PropertyMapping dtype='factor' and DuckDB ENUM columns.""" + + def _make_vdb_with_factor(self, tmp_path, monkeypatch, feature_dtype): + """ + Helper: build a VirtualDB with one dataset whose 'category' field + has a PropertyMapping with dtype='factor'. ``feature_dtype`` is + passed as the FeatureInfo.dtype for the 'category' field in the + mock DataCard. 
+ """ + import tfbpapi.virtual_db as vdb_module + + df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2], + "category": ["A", "B", "A", "C"], + "value": [1.0, 2.0, 3.0, 4.0], + } + ) + parquet_path = tmp_path / "data.parquet" + files = {("TestOrg/ds", "cfg"): [_write_parquet(parquet_path, df)]} + + config = { + "repositories": { + "TestOrg/ds": { + "dataset": { + "cfg": { + "db_name": "ds", + "sample_id": {"field": "sample_id"}, + "category": { + "field": "category", + "dtype": "factor", + }, + } + } + } + } + } + config_file = tmp_path / "config.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + card = MagicMock() + card.get_metadata_fields.return_value = ["sample_id", "category"] + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + card.get_metadata_config_name.return_value = None + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"sample_id", "category", "value"}, + metadata_columns={"sample_id", "category"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + feature_list = [ + FeatureInfo( + name="category", + dtype=feature_dtype, + description="A categorical field", + ), + FeatureInfo( + name="sample_id", + dtype="int64", + description="Sample identifier", + ), + ] + card.get_features.return_value = feature_list + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + return VirtualDB(config_file) + + def test_factor_dtype_creates_enum_column(self, tmp_path, monkeypatch): + """Dtype='factor' casts the column to a DuckDB ENUM in the _meta view.""" + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + df = v.query("SELECT * FROM ds_meta ORDER BY sample_id") + assert 
"category" in df.columns + # Values should be preserved + assert set(df["category"].dropna()) == {"A", "B", "C"} + + def test_factor_dtype_enum_type_registered(self, tmp_path, monkeypatch): + """The DuckDB ENUM type is registered and can be queried.""" + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + # Trigger view registration + v.tables() + # The ENUM type should be registered in DuckDB + types_df = v._conn.execute( + "SELECT type_name FROM duckdb_types() WHERE logical_type = 'ENUM'" + ).fetchdf() + assert "_enum_category" in types_df["type_name"].tolist() + + def test_factor_dtype_missing_class_label_raises(self, tmp_path, monkeypatch): + """ValueError is raised when the DataCard field has no class_label dtype.""" + with pytest.raises(ValueError, match="class_label"): + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype="string", # not a class_label dict + ) + v.tables() # triggers view registration + + def test_factor_dtype_no_names_raises(self, tmp_path, monkeypatch): + """ValueError is raised when class_label has no 'names' key.""" + with pytest.raises(ValueError, match="names"): + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {}}, # no names + ) + v.tables() + + def test_factor_dtype_model_validator_requires_field(self): + """PropertyMapping with dtype='factor' and no field raises ValidationError.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError, match="factor"): + from tfbpapi.models import PropertyMapping + + PropertyMapping.model_validate({"path": "some.path", "dtype": "factor"}) + + def test_factor_dtype_model_validator_rejects_expression(self): + """PropertyMapping with dtype='factor' and expression raises ValidationError.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError): + from tfbpapi.models import PropertyMapping + + 
PropertyMapping.model_validate({"expression": "col > 0", "dtype": "factor"}) + + def test_factor_dtype_inplace_renames_raw_to_orig(self, tmp_path, monkeypatch): + """ + When dtype='factor' maps a field to the same output name (e.g. + category: {field: category, dtype: factor}), the raw column is + renamed to _orig in the _meta view, and the ENUM-cast column + keeps the original name. + """ + v = self._make_vdb_with_factor( + tmp_path, + monkeypatch, + feature_dtype={"class_label": {"names": ["A", "B", "C"]}}, + ) + df = v.query("SELECT * FROM ds_meta ORDER BY sample_id") + # ENUM-cast column keeps the original name + assert "category" in df.columns + # Raw numeric/string original is preserved under _orig alias + assert "category_orig" in df.columns + # The _orig column should hold the raw values + assert set(df["category_orig"].dropna()) == {"A", "B", "C"} + + def test_factor_dtype_orig_suffix_avoids_collision(self, tmp_path, monkeypatch): + """When _orig already exists in the parquet, the rename uses _orig_1 + instead.""" + import tfbpapi.virtual_db as vdb_module + + df = pd.DataFrame( + { + "sample_id": [1, 2], + "category": ["A", "B"], + "category_orig": ["x", "y"], # pre-existing _orig column + "value": [1.0, 2.0], + } + ) + parquet_path = tmp_path / "data2.parquet" + files = {("TestOrg/ds2", "cfg2"): [_write_parquet(parquet_path, df)]} + + config = { + "repositories": { + "TestOrg/ds2": { + "dataset": { + "cfg2": { + "db_name": "ds2", + "sample_id": {"field": "sample_id"}, + "category": { + "field": "category", + "dtype": "factor", + }, + } + } + } + } + } + config_file = tmp_path / "config2.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + card = MagicMock() + card.get_metadata_fields.return_value = [ + "sample_id", + "category", + "category_orig", + ] + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + card.get_metadata_config_name.return_value = None + card.get_dataset_schema.return_value 
= DatasetSchema( + data_columns={"sample_id", "category", "category_orig", "value"}, + metadata_columns={"sample_id", "category", "category_orig"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + card.get_features.return_value = [ + FeatureInfo( + name="category", + dtype={"class_label": {"names": ["A", "B"]}}, + description="categorical", + ), + FeatureInfo( + name="sample_id", + dtype="int64", + description="id", + ), + ] + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + v = VirtualDB(config_file) + + result = v.query("SELECT * FROM ds2_meta ORDER BY sample_id") + # Should use _orig_1 because _orig is taken + assert "category_orig_1" in result.columns + assert "category" in result.columns diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py new file mode 100644 index 0000000..ed16dfa --- /dev/null +++ b/tfbpapi/virtual_db.py @@ -0,0 +1,1610 @@ +""" +VirtualDB provides a SQL query interface across heterogeneous datasets. + +A developer creates huggingface repos with datacards. Datacard specifications +specific to tfbpapi can be found at +https://brentlab.github.io/tfbpapi/huggingface_datacard/. Next, a developer can create +a virtualDB configuration file that describes which huggingface repos and datasets to +use, a set of common fields, datasets that contain comparative analytics, and more. +VirtualDB, this code, then uses DuckDB to construct views over Parquet files cached +locally on initialization. For primary datasets, VirtualDB creates metadata +views (one row per sample with derived columns) and full data views (measurement-level +data joined to metadata). 
For comparative analysis datasets, VirtualDB creates expanded +views that parse composite ID fields into ``_source`` (aliased to the configured +db_name) and ``_id`` (sample identifier) columns. The expectation is that a developer +will use this interface to write SQL queries against the views to provide an API to +downstream users and applications. + +Example Usage:: + + from tfbpapi.virtual_db import VirtualDB + + vdb = VirtualDB("config.yaml", token=token) + + # Discover views + vdb.tables() + vdb.describe("harbison") + + # Raw SQL + df = vdb.query("SELECT * FROM harbison WHERE sample_id = 42") + + # Parameterized SQL + df = vdb.query( + "SELECT * FROM harbison_meta WHERE carbon_source = $cs", + cs="glucose", + ) + + # Prepared queries + vdb.prepare("sig", "SELECT * FROM harbison_meta LIMIT $n") + df = vdb.query("sig", n=10) + +""" + +from __future__ import annotations + +import logging +import re +from functools import lru_cache +from pathlib import Path +from typing import Any + +import duckdb +import pandas as pd +from duckdb import BinderException + +from tfbpapi.datacard import DataCard, DatasetSchema +from tfbpapi.models import DatasetType, MetadataConfig + +logger = logging.getLogger(__name__) + + +class QueryError(Exception): + """Raised when a VirtualDB query fails at execution time.""" + + pass + + +def get_nested_value(data: dict | list, path: str) -> Any: + """ + Navigate nested dict/list using dot notation. + + Handles missing intermediate keys gracefully by returning None. + When an intermediate value is a list of dicts, extracts the + remaining path from each item and returns a list of results. + + :param data: Dictionary or list of dicts to navigate + :param path: Dot-separated path (e.g., "media.carbon_source.compound") + :return: Value at path, list of values, or None if not found + + :raises TypeError: If an unexpected type is encountered during navigation of the + dict/list structure according to the provided path. 
+ + Example -- dict input:: + + >>> get_nested_value({"media": {"name": "YPD"}}, "media.name") + 'YPD' + + Example -- list-of-dicts at an intermediate node:: + + >>> data = { + ... "media": { + ... "carbon_source": [ + ... {"compound": "glucose"}, + ... ] + ... } + ... } + >>> get_nested_value(data, "media.carbon_source.compound") + ['glucose'] + + """ + if not isinstance(data, (dict, list)): + return None + + # If top-level data is a list, extract path from each item + if isinstance(data, list): + results = [] + for item in data: + if isinstance(item, dict): + val = get_nested_value(item, path) + if val is not None: + results.append(val) + return results if results else None + + keys = path.split(".") + current = data + + for i, key in enumerate(keys): + if isinstance(current, dict): + if key not in current: + logger.warning( + "Key '%s' not found at path '%s' (current keys: %s)", + key, + ".".join(keys[: i + 1]), + list(current.keys()), + ) + return None + current = current[key] + elif isinstance(current, list): + # Extract the remaining path from each list item + remaining_path = ".".join(keys[i:]) + results = [] + for item in current: + if isinstance(item, dict): + val = get_nested_value(item, remaining_path) + if val is not None: + results.append(val) + return results if results else None + else: + error_msg = ( + f"Unexpected type '{type(current).__name__}' at " + f"path '{'.'.join(keys[:i])}'; expected dict or " + f"list of dicts" + ) + logger.error(error_msg) + raise TypeError(error_msg) + + return current + + +@lru_cache(maxsize=32) +def _cached_datacard(repo_id: str, token: str | None = None) -> Any: + """ + Return a cached DataCard instance. + + :param repo_id: HuggingFace repository ID + :param token: Optional HuggingFace token + :return: DataCard instance + + """ + return DataCard(repo_id, token=token) + + +class VirtualDB: + """ + A query interface across heterogeneous datasets. 
+ + DuckDB views are lazily registered over Parquet files on first + ``query()`` call. The user writes SQL against named views. + + :ivar config: Validated MetadataConfig + :ivar token: Optional HuggingFace token + + """ + + def __init__( + self, + config_path: Path | str, + token: str | None = None, + duckdb_connection: duckdb.DuckDBPyConnection | None = None, + ): + """ + Initialize VirtualDB with configuration. + + Creates the DuckDB connection and registers all views immediately. + + :param config_path: Path to YAML configuration file + :param token: Optional HuggingFace token for private datasets + :param duckdb_connection: Optional DuckDB connection. If provided, views will be + registered on this connection instead of creating a new in-memory database. + This provides a method of using a persistent database file. If not provided, + an in-memory DuckDB connection is created. + :raises FileNotFoundError: If config file does not exist + :raises ValueError: If configuration is invalid + + """ + self.config = MetadataConfig.from_yaml(config_path) + self.token = token + + self._conn: duckdb.DuckDBPyConnection = ( + duckdb_connection + if duckdb_connection is not None + else duckdb.connect(":memory:") + ) + + # db_name -> (repo_id, config_name) + self._db_name_map = self._build_db_name_map() + + # Prepared queries: name -> sql + self._prepared_queries: dict[str, str] = {} + + self._load_datacards() + self._validate_datacards() + self._update_cache() + self._register_all_views() + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def query(self, sql: str, **params: Any) -> pd.DataFrame: + """ + Execute SQL or a prepared query and return a DataFrame. + + If *sql* matches a registered prepared-query name the stored + SQL template is used instead. Keyword arguments are passed as + named parameters to DuckDB. 
+ + :param sql: Raw SQL string **or** name of a prepared query + :param params: Named parameters (DuckDB ``$name`` syntax) + :return: Query result as a pandas DataFrame + + Examples:: + + # Raw SQL + df = vdb.query("SELECT * FROM harbison LIMIT 5") + + # With parameters + df = vdb.query( + "SELECT * FROM harbison_meta WHERE carbon_source = $cs", + cs="glucose", + ) + + # Prepared query + vdb.prepare("top", "SELECT * FROM harbison_meta LIMIT $n") + df = vdb.query("top", n=10) + + """ + # param `sql` may be a prepared query name, a raw sql statement, or + # a parameterized sql statement that is not prepared. If it exists as a key + # in the _prepared_queries dict, we use the prepared sql. Otherwise, we + # use the sql as passed to query(). + resolved = self._prepared_queries.get(sql, sql) + try: + if params: + return self._conn.execute(resolved, params).fetchdf() + return self._conn.execute(resolved).fetchdf() + except Exception as exc: + import pprint + + params_repr = pprint.pformat(params, indent=2) + raise QueryError( + f"query failed: {exc}\n\n" f"SQL:\n{sql}\n\n" f"params:\n{params_repr}" + ) from exc + + def prepare(self, name: str, sql: str, overwrite: bool = False) -> None: + """ + Register a named parameterized query for later use. + + Parameters use DuckDB ``$name`` syntax. + + :param name: Query name (must not collide with a view name) + :param sql: SQL template with ``$name`` parameters + :param overwrite: If True, overwrite existing prepared query + with same name + :raises ValueError: If *name* collides with an existing view + + Example:: + + vdb.prepare("glucose_regs", ''' + SELECT regulator_symbol, COUNT(*) AS n + FROM harbison_meta + WHERE carbon_source = $cs + GROUP BY regulator_symbol + HAVING n >= $min_n + ''') + df = vdb.query("glucose_regs", cs="glucose", min_n=2) + + """ + + if name in self._list_views() and not overwrite: + error_msg = ( + f"Prepared-query name '{name}' collides with " + f"an existing view. 
Choose a different name or set " + f"overwrite=True." + ) + logger.error(error_msg) + raise ValueError(error_msg) + self._prepared_queries[name] = sql + + def tables(self) -> list[str]: + """ + Return sorted list of registered view names. + + :return: Sorted list of view names + + """ + + return sorted(self._list_views()) + + def describe(self, table: str | None = None) -> pd.DataFrame: + """ + Describe column names and types for one or all views. + + :param table: View name, or None for all views + :return: DataFrame with columns ``table``, ``column_name``, + ``column_type`` + + """ + + if table is not None: + df = self._conn.execute(f"DESCRIBE {table}").fetchdf() + df.insert(0, "table", table) + return df + + frames = [] + for view in sorted(self._list_views()): + df = self._conn.execute(f"DESCRIBE {view}").fetchdf() + df.insert(0, "table", view) + frames.append(df) + if not frames: + return pd.DataFrame(columns=["table", "column_name", "column_type"]) + return pd.concat(frames, ignore_index=True) + + def get_fields(self, table: str | None = None) -> list[str]: + """ + Return column names for a view or all unique columns. + + :param table: View name, or None for all views + :return: Sorted list of column names + + """ + + if table is not None: + cols = self._conn.execute( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{table}'" + ).fetchdf() + return sorted(cols["column_name"].tolist()) + + all_cols: set[str] = set() + for view in self._list_views(): + cols = self._conn.execute( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{view}'" + ).fetchdf() + all_cols.update(cols["column_name"].tolist()) + return sorted(all_cols) + + def get_common_fields(self) -> list[str]: + """ + Return columns present in ALL primary ``_meta`` views. + + Primary dataset views are those without ``links`` in their + config (i.e. not comparative datasets). 
+ + :return: Sorted list of common column names + + """ + + meta_views = self._get_primary_meta_view_names() + if not meta_views: + return [] + + sets = [] + for view in meta_views: + cols = self._conn.execute( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{view}'" + ).fetchdf() + sets.append(set(cols["column_name"].tolist())) + + common = set.intersection(*sets) + return sorted(common) + + def get_datasets(self) -> list[str]: + """ + Return the sorted list of dataset names known to this VirtualDB. + + Dataset names are the resolved ``db_name`` values from the + configuration (falling back to the config_name when ``db_name`` + is not explicitly set). These are the names accepted by + :meth:`get_tags` and queryable via :meth:`query`. + + Unlike :meth:`tables`, this method reads directly from the + configuration and does not require views to be registered, so + no data is downloaded. + + :return: Sorted list of dataset names + + """ + return sorted(self._db_name_map) + + def get_tags(self, db_name: str) -> dict[str, str]: + """ + Return the merged tags for a dataset. + + Tags are defined in the configuration at the repository and/or + dataset level. Dataset-level tags override repository-level tags + with the same key. See the ``tags`` section of the configuration + guide for details. + + :param db_name: Dataset name as it appears in :meth:`tables` (the + resolved ``db_name`` from the configuration, or the + ``config_name`` if ``db_name`` was not explicitly set). + :return: Dict of merged tags, or empty dict if the dataset has no + tags or the name is not found. 
+ + """ + if db_name not in self._db_name_map: + return {} + repo_id, config_name = self._db_name_map[db_name] + return self.config.get_tags(repo_id, config_name) + + # ------------------------------------------------------------------ + # Initialisation phases + # ------------------------------------------------------------------ + + def _load_datacards(self) -> None: + """ + Fetch (or load from cache) the DataCard for every distinct repo. + + Populates ``self._datacards`` keyed by ``repo_id``. Failures are + logged as warnings and the repo is omitted from the dict so that + subsequent phases can skip it gracefully. + + """ + self._datacards: dict[str, DataCard] = {} + seen_repos: set[str] = set() + for repo_id, _ in self._db_name_map.values(): + if repo_id in seen_repos: + continue + seen_repos.add(repo_id) + try: + self._datacards[repo_id] = _cached_datacard(repo_id, token=self.token) + except Exception as exc: + logger.warning( + "Could not load datacard for repo '%s': %s", + repo_id, + exc, + ) + + def _validate_datacards(self) -> None: + """ + Cross-check the VirtualDB config against the loaded datacards. + + Checks that every dataset with a ``links`` field in the VirtualDB + config has ``dataset_type: comparative`` in its HuggingFace datacard. + Also resolves ``self._dataset_schemas`` and + ``self._external_meta_configs`` (keyed by ``db_name``) for use by + ``_update_cache`` and ``_register_all_views``. + + :raises ValueError: If a dataset with ``links`` does not have + ``dataset_type: comparative`` in its datacard. 
+ + """ + self._dataset_schemas: dict[str, DatasetSchema] = {} + # db_name -> external metadata config_name (for applies_to datasets) + self._external_meta_configs: dict[str, str] = {} + + for db_name, (repo_id, config_name) in self._db_name_map.items(): + repo_cfg = self.config.repositories.get(repo_id) + ds_cfg = ( + repo_cfg.dataset.get(config_name) + if repo_cfg and repo_cfg.dataset + else None + ) + card = self._datacards.get(repo_id) + + # Validate comparative dataset_type agreement. + if ds_cfg and ds_cfg.links: + if card is not None: + dc_config = card.get_config(config_name) + if ( + dc_config is not None + and dc_config.dataset_type != DatasetType.COMPARATIVE + ): + raise ValueError( + f"Dataset '{config_name}' in repo '{repo_id}' has " + f"'links' in the VirtualDB config, indicating a " + f"comparative dataset, but the HuggingFace datacard " + f"declares dataset_type='{dc_config.dataset_type}'. " + f"Update the datacard to use dataset_type: comparative." + ) + continue # comparative datasets need no schema resolution + + # Resolve dataset schema and external metadata config for + # primary datasets. + if card is None: + continue + try: + schema = card.get_dataset_schema(config_name) + except Exception as exc: + logger.warning( + "Could not get dataset schema for %s/%s: %s", + repo_id, + config_name, + exc, + ) + continue + if schema is not None: + self._dataset_schemas[db_name] = schema + if ( + schema is not None + and schema.metadata_source == "external" + and schema.external_metadata_config + ): + self._external_meta_configs[db_name] = schema.external_metadata_config + + def _update_cache(self) -> None: + """ + Download (or locate cached) Parquet files for all dataset configs. + + Populates ``self._parquet_files`` keyed by ``db_name``. 
For datasets
+        with external metadata (identified during ``_validate_datacards``),
+        also downloads those files and stores them under the key
+        ``"__<db_name>_meta"`` so ``_register_all_views`` can read them
+        without further network calls.
+
+        """
+        self._parquet_files: dict[str, list[str]] = {}
+        for db_name, (repo_id, config_name) in self._db_name_map.items():
+            files = self._resolve_parquet_files(repo_id, config_name)
+            self._parquet_files[db_name] = files
+
+        for db_name, ext_config_name in self._external_meta_configs.items():
+            repo_id, _ = self._db_name_map[db_name]
+            files = self._resolve_parquet_files(repo_id, ext_config_name)
+            self._parquet_files[f"__{db_name}_meta"] = files
+
+    def _register_all_views(self) -> None:
+        """
+        Register all DuckDB views in dependency order.
+
+        Expects ``self._parquet_files``, ``self._dataset_schemas``, and
+        ``self._external_meta_configs`` to have been populated by the earlier
+        init phases. No network or disk access occurs here.
+
+        """
+        # 1. Raw per-dataset views (internal __<db_name>_parquet
+        # plus public <db_name> for primary datasets only)
+        for db_name, (repo_id, config_name) in self._db_name_map.items():
+            comparative = self._is_comparative(repo_id, config_name)
+            self._register_raw_view(
+                db_name,
+                parquet_only=comparative,
+            )
+
+        # 2. External metadata parquet views.
+        # When a data config's metadata lives in a separate HF config
+        # (applies_to), register its parquet as __<db_name>_metadata_parquet.
+ self._external_meta_views: dict[str, str] = {} + for db_name, ext_config_name in self._external_meta_configs.items(): + meta_view = f"__{db_name}_metadata_parquet" + files = self._parquet_files.get(f"__{db_name}_meta", []) + if not files: + logger.warning( + "No parquet files for external metadata config " + "'%s' (db_name '%s') -- skipping external metadata view", + ext_config_name, + db_name, + ) + continue + files_sql = ", ".join(f"'{f}'" for f in files) + try: + self._conn.execute( + f"CREATE OR REPLACE VIEW {meta_view} AS " + f"SELECT * FROM read_parquet([{files_sql}])" + ) + except Exception as exc: + logger.warning( + "Failed to create external metadata view '%s': %s", + meta_view, + exc, + ) + continue + self._external_meta_views[db_name] = meta_view + + # 3. Metadata views for primary datasets (_meta) + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if not self._is_comparative(repo_id, config_name): + self._register_meta_view(db_name, repo_id, config_name) + + # 4. Replace primary raw views with join to _meta so + # derived columns (e.g. carbon_source) are available + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if not self._is_comparative(repo_id, config_name): + self._enrich_raw_view(db_name) + + # 5. Comparative expanded views (pre-parsed composite IDs) + for db_name, (repo_id, config_name) in self._db_name_map.items(): + repo_cfg = self.config.repositories.get(repo_id) + if not repo_cfg or not repo_cfg.dataset: + continue + ds_cfg = repo_cfg.dataset.get(config_name) + if ds_cfg and ds_cfg.links: + self._register_comparative_expanded_view(db_name, ds_cfg) + + # ------------------------------------------------------------------ + # db_name mapping + # ------------------------------------------------------------------ + + def _build_db_name_map(self) -> dict[str, tuple[str, str]]: + """ + Build mapping from resolved db_name to (repo_id, config_name). 
+ + :return: Dict mapping db_name -> (repo_id, config_name) + + """ + mapping: dict[str, tuple[str, str]] = {} + for repo_id, repo_cfg in self.config.repositories.items(): + if not repo_cfg.dataset: + continue + for config_name, ds_cfg in repo_cfg.dataset.items(): + resolved = ds_cfg.db_name or config_name + mapping[resolved] = (repo_id, config_name) + return mapping + + # ------------------------------------------------------------------ + # Parquet file resolution + # ------------------------------------------------------------------ + + def _resolve_parquet_files(self, repo_id: str, config_name: str) -> list[str]: + """ + Download (or locate cached) Parquet files for a dataset config. + + Uses ``huggingface_hub.snapshot_download`` with the file patterns + from the DataCard. + + :param repo_id: HuggingFace repository ID + :param config_name: Dataset configuration name + :return: List of absolute paths to Parquet files + + """ + card = DataCard(repo_id, token=self.token) + config = card.get_config(config_name) + if not config: + logger.warning( + "Config '%s' not found in repo '%s'", + config_name, + repo_id, + ) + return [] + + file_patterns = [df.path for df in config.data_files] + + from huggingface_hub import snapshot_download + + downloaded_path = snapshot_download( + repo_id=repo_id, + repo_type="dataset", + allow_patterns=file_patterns, + token=self.token, + ) + + parquet_files: list[str] = [] + for pattern in file_patterns: + file_path = Path(downloaded_path) / pattern + if file_path.exists() and file_path.suffix == ".parquet": + parquet_files.append(str(file_path)) + elif "*" in pattern: + base = Path(downloaded_path) + parquet_files.extend( + str(f) for f in base.glob(pattern) if f.suffix == ".parquet" + ) + else: + parent_dir = Path(downloaded_path) / Path(pattern).parent + if parent_dir.exists(): + parquet_files.extend(str(f) for f in parent_dir.glob("*.parquet")) + + return parquet_files + + # 
------------------------------------------------------------------
+    # View registration helpers
+    # ------------------------------------------------------------------
+
+    def _register_raw_view(
+        self,
+        db_name: str,
+        *,
+        parquet_only: bool = False,
+    ) -> None:
+        """
+        Register a raw DuckDB view over pre-resolved Parquet files.
+
+        Creates an internal ``__<db_name>_parquet`` view that reads
+        directly from the Parquet files. For primary datasets, also
+        creates a public ``<db_name>`` view (initially identical)
+        that may later be replaced by ``_enrich_raw_view``.
+
+        For comparative datasets, only the internal parquet view is
+        created; the public view is the ``<db_name>_expanded`` view instead.
+
+        Parquet files must have been resolved by ``_update_cache``
+        before this method is called.
+
+        :param db_name: View name
+        :param parquet_only: If True, only create the internal
+            ``__<db_name>_parquet`` view (no public ``<db_name>``).
+
+        """
+        files = self._parquet_files.get(db_name, [])
+        if not files:
+            logger.warning(
+                "No parquet files for db_name '%s' -- skipping view",
+                db_name,
+            )
+            return
+
+        files_sql = ", ".join(f"'{f}'" for f in files)
+        parquet_sql = f"SELECT * FROM read_parquet([{files_sql}])"
+        self._conn.execute(
+            f"CREATE OR REPLACE VIEW __{db_name}_parquet AS " f"{parquet_sql}"
+        )
+        if not parquet_only:
+            sample_col = self._get_sample_id_col(db_name)
+            if sample_col == "sample_id":
+                public_select = f"SELECT * FROM __{db_name}_parquet"
+            else:
+                raw_cols = self._get_view_columns(f"__{db_name}_parquet")
+                parts: list[str] = []
+                for col in raw_cols:
+                    if col == sample_col:
+                        parts.append(f"{col} AS sample_id")
+                    elif col == "sample_id":
+                        parts.append(f"{col} AS sample_id_orig")
+                    else:
+                        parts.append(col)
+                cols_sql = ", ".join(parts)
+                public_select = f"SELECT {cols_sql} FROM __{db_name}_parquet"
+            self._conn.execute(f"CREATE OR REPLACE VIEW {db_name} AS {public_select}")
+
+    def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> None:
+        """
+        Register a
``<db_name>_meta`` view with one row per sample.
+
+        Includes metadata columns from the DataCard plus any derived columns
+        from config property mappings (resolved against DataCard definitions
+        with factor aliases applied).
+
+        For datasets with external metadata (a separate HF config with
+        ``applies_to``), JOINs the data parquet to the metadata parquet
+        on the configured sample_id column. The actual columns in the metadata
+        parquet are determined by DuckDB introspection (``DESCRIBE``) rather
+        than the DataCard feature list, because DataCard feature lists are
+        conceptual schemas that may include columns not physically present
+        in the parquet files.
+
+        :param db_name: Base view name for the primary dataset
+        :param repo_id: Repository ID
+        :param config_name: Configuration name
+
+        :raises ValueError: If no metadata fields are found.
+        :raises BinderException: If view creation fails, with SQL details.
+
+        """
+        parquet_view = f"__{db_name}_parquet"
+        if not self._view_exists(parquet_view):
+            return
+
+        sample_col = self._get_sample_id_col(db_name)
+
+        # Pull ext_meta_view early -- needed for both meta_cols and
+        # FROM clause construction.
+        schema: DatasetSchema | None = self._dataset_schemas.get(db_name)
+        ext_meta_view: str | None = self._external_meta_views.get(db_name)
+
+        is_external = (
+            ext_meta_view is not None
+            and schema is not None
+            and schema.metadata_source == "external"
+        )
+
+        if is_external:
+            # DataCard feature lists are conceptual -- columns listed there
+            # may not be physically present in the parquet file. Use DuckDB
+            # introspection to get the actual columns in the metadata parquet.
+ assert ext_meta_view is not None + actual_meta_cols: set[str] = set(self._get_view_columns(ext_meta_view)) + meta_cols: list[str] = sorted(actual_meta_cols) + elif schema is not None: + actual_meta_cols = schema.metadata_columns + meta_cols = sorted(actual_meta_cols) + else: + meta_cols = self._resolve_metadata_fields(repo_id, config_name) or [] + actual_meta_cols = set(meta_cols) + + if not meta_cols: + raise ValueError( + f"No metadata fields found for {repo_id}/{config_name}. " + f"Cannot create meta view '{db_name}_meta'." + ) + + # FROM clause: JOIN data + metadata parquets when external, + # plain parquet view otherwise. + if is_external: + assert ext_meta_view is not None + # Use the configured sample_id column as the join key. + # The DataCard feature intersection (schema.join_columns) + # is unreliable because a data config's feature list may + # document columns that are physically only in the metadata + # parquet (present conceptually after a join, not in the + # physical data parquet file). + from_clause = ( + f"{parquet_view} d " f"JOIN {ext_meta_view} m " f"USING ({sample_col})" + ) + is_join = True + else: + from_clause = parquet_view + is_join = False + + def qualify(col: str) -> str: + """Return qualified column name for JOIN context.""" + if not is_join: + return col + if col == sample_col: + return col # USING makes join key unqualified + # Use the actual metadata parquet columns (from DuckDB + # introspection) to decide qualification, not the DataCard + # feature list which may be inaccurate. + if col in actual_meta_cols: + return f"m.{col}" + return f"d.{col}" + + # Resolve derived property expressions first. + # When a factor mapping has the same output name as its source + # field (e.g. time -> time), the raw column must be renamed to + # avoid a duplicate column name in the SELECT. The rename uses + # "_orig", or "_orig_1", etc., to avoid collisions with + # other columns that already exist in the parquet. 
+        prop_result = self._resolve_property_columns(repo_id, config_name)
+
+        # Collect all column names that exist in the parquet so we can
+        # find a unique <col>_orig suffix when needed.
+        all_parquet_cols: set[str] = set(self._get_view_columns(parquet_view))
+
+        # Map: raw_col -> alias_in_select for factor-overridden cols
+        factor_col_renames: dict[str, str] = {}
+        if prop_result is not None:
+            _derived_exprs, _prop_raw_cols = prop_result
+            for expr in _derived_exprs:
+                # Detect factor CAST expressions of the form:
+                # CAST(CAST(<field> AS VARCHAR) AS _enum_<name>) AS <out>
+                # where <out> == <field> (in-place factor override).
+                # The output column name is the last " AS " token.
+                parts = expr.rsplit(" AS ", 1)
+                if len(parts) != 2:
+                    continue
+                out_col = parts[1].strip()
+                # Extract the innermost source field from the CAST chain.
+                # Handles both:
+                # CAST(CAST(<field> AS VARCHAR) AS _enum_<name>)
+                # CAST(CAST(CAST(<field> AS BIGINT) AS VARCHAR) AS _enum_<name>)
+                m = re.match(
+                    r"CAST\(CAST\((?:CAST\()?(\w+)(?:\s+AS\s+BIGINT\))?"
+                    r"\s+AS\s+VARCHAR\)\s+AS\s+_enum_\w+\)",
+                    parts[0],
+                )
+                if m is None:
+                    continue
+                src_field = m.group(1)
+                if src_field == out_col and out_col in all_parquet_cols:
+                    # Find a unique <out_col>_orig name
+                    candidate = f"{out_col}_orig"
+                    n = 1
+                    while candidate in all_parquet_cols or candidate in (
+                        v for v in factor_col_renames.values()
+                    ):
+                        candidate = f"{out_col}_orig_{n}"
+                        n += 1
+                    factor_col_renames[src_field] = candidate
+
+        # Build SELECT: sample_id + metadata cols (deduplicated).
+        # Raw columns that are factor-overridden are emitted with their
+        # <col>_orig alias instead of their original name.
+        # If the configured sample_id column differs from "sample_id",
+        # rename it so all views expose a consistent "sample_id" column.
+        # If the parquet also has a literal "sample_id" column, preserve
+        # it as "sample_id_orig" to avoid losing data.
+ seen: set[str] = set() + select_parts: list[str] = [] + rename_sample = sample_col != "sample_id" + + def add_col(col: str) -> None: + if col in seen: + return + seen.add(col) + alias = factor_col_renames.get(col) + if alias: + select_parts.append(f"{qualify(col)} AS {alias}") + elif rename_sample and col == sample_col: + select_parts.append(f"{qualify(col)} AS sample_id") + elif rename_sample and col == "sample_id": + select_parts.append(f"{qualify(col)} AS sample_id_orig") + else: + select_parts.append(qualify(col)) + + add_col(sample_col) + # When renaming, check if the parquet source also has a literal + # "sample_id" column; if so, preserve it as "sample_id_orig". + if rename_sample: + source_cols = set(self._get_view_columns(parquet_view)) + if "sample_id" in source_cols: + add_col("sample_id") + for col in meta_cols: + add_col(col) + + # Add derived property expressions from the VirtualDB config + if prop_result is not None: + derived_exprs, prop_raw_cols = prop_result + # Ensure source columns needed by expressions are selected. + # For external metadata datasets, restrict to columns physically + # present in the metadata parquet -- data columns must not bleed + # into the meta view. + allowed_raw_cols = ( + [c for c in prop_raw_cols if c in actual_meta_cols] + if is_external + else prop_raw_cols + ) + for col in allowed_raw_cols: + add_col(col) + # Rewrite CAST expressions to use the _orig alias when the + # source field was renamed to avoid collision. 
            if factor_col_renames:
                rewritten = []
                for expr in derived_exprs:
                    for orig, alias in factor_col_renames.items():
                        # Replace "CAST(<orig> AS" with "CAST(<alias> AS"
                        expr = expr.replace(f"CAST({orig} AS", f"CAST({alias} AS")
                    rewritten.append(expr)
                derived_exprs = rewritten
            # Qualify source column references inside CASE WHEN expressions
            if is_join:
                qualified_exprs = []
                for expr in derived_exprs:
                    for raw_col in prop_raw_cols:
                        q = qualify(raw_col)
                        if q != raw_col:
                            # Replace bare column name in CASE WHEN patterns
                            expr = expr.replace(
                                f"CASE {raw_col} ", f"CASE {q} "
                            ).replace(f" {raw_col} = ", f" {q} = ")
                    qualified_exprs.append(expr)
                derived_exprs = qualified_exprs
            select_parts.extend(derived_exprs)

        cols_sql = ", ".join(select_parts)
        sql = (
            f"CREATE OR REPLACE VIEW {db_name}_meta AS "
            f"SELECT DISTINCT {cols_sql} FROM {from_clause}"
        )
        try:
            self._conn.execute(sql)
        except BinderException as exc:
            # Re-raise with full context: the generated SQL is the most
            # useful artifact when debugging a bad meta-view definition.
            raise BinderException(
                f"Failed to create meta view '{db_name}_meta'.\n"
                f" schema: {schema}\n"
                f" from_clause: {from_clause}\n"
                f" SQL: {sql}\n"
                f" error: {exc}"
            ) from exc

    def _enrich_raw_view(self, db_name: str) -> None:
        """
        Replace a primary raw view with a join to its ``_meta`` view.

        If ``<db_name>_meta`` has derived columns not present in the
        raw parquet view, recreates ``<db_name>`` as a join so derived
        columns (e.g. ``carbon_source``) appear alongside measurement
        data.

        :param db_name: Base view name for the primary dataset

        """
        meta_name = f"{db_name}_meta"
        parquet_name = f"__{db_name}_parquet"
        if not self._view_exists(meta_name) or not self._view_exists(parquet_name):
            return

        raw_cols_list = self._get_view_columns(parquet_name)
        raw_cols = set(raw_cols_list)
        meta_cols = set(self._get_view_columns(meta_name))

        sample_col = self._get_sample_id_col(db_name)
        rename_sample = sample_col != "sample_id"

        # Columns to pull from _meta that aren't already in raw parquet,
        # accounting for the sample_id rename: when renaming, "sample_id"
        # will appear in meta_cols (as the renamed column) but not in
        # raw_cols (which has the original name), so we must exclude it
        # from extra_cols since the rename in the raw SELECT already
        # provides it.
        if rename_sample:
            # "sample_id" and "sample_id_orig" come from the raw SELECT
            # rename, not from meta
            extra_cols = meta_cols - raw_cols - {"sample_id", "sample_id_orig"}
        else:
            extra_cols = meta_cols - raw_cols

        if not extra_cols:
            # No derived columns to add -- the view created in
            # _register_raw_view (which already handles the rename)
            # is sufficient.
            return

        if rename_sample:
            # Build explicit SELECT to rename the sample column
            raw_parts: list[str] = []
            for col in raw_cols_list:
                if col == sample_col:
                    raw_parts.append(f"r.{col} AS sample_id")
                elif col == "sample_id":
                    raw_parts.append(f"r.{col} AS sample_id_orig")
                else:
                    raw_parts.append(f"r.{col}")
            raw_select = ", ".join(raw_parts)
        else:
            raw_select = "r.*"

        if extra_cols:
            extra_select = ", ".join(f"m.{c}" for c in sorted(extra_cols))
            full_select = f"{raw_select}, {extra_select}"
        else:
            full_select = raw_select

        # ON-join when renaming (raw column name differs from meta's
        # "sample_id"); USING-join otherwise to avoid a duplicate column.
        if rename_sample:
            join_clause = f"JOIN {meta_name} m ON r.{sample_col} = m.sample_id"
        else:
            join_clause = f"JOIN {meta_name} m USING ({sample_col})"

        self._conn.execute(
            f"CREATE OR REPLACE VIEW {db_name} AS "
            f"SELECT {full_select} "
            f"FROM {parquet_name} r "
            f"{join_clause}"
        )

    def _get_view_columns(self, view: str) -> list[str]:
        """
        Return column names for a view.

        Uses ``DESCRIBE`` rather than ``information_schema`` to force
        eager schema resolution for ``read_parquet``-backed views,
        which DuckDB may evaluate lazily.

        :param view: View name to describe
        :return: Column names in declaration order

        """
        df = self._conn.execute(f"DESCRIBE {view}").fetchdf()
        return df["column_name"].tolist()

    def _get_sample_id_col(self, db_name: str) -> str:
        """
        Resolve the sample identifier column name for a dataset.

        :param db_name: Resolved database view name
        :return: Actual column name for the sample identifier

        """
        repo_id, config_name = self._db_name_map[db_name]
        return self.config.get_sample_id_field(repo_id, config_name)

    def _resolve_metadata_fields(
        self, repo_id: str, config_name: str
    ) -> list[str] | None:
        """
        Get metadata field names from the DataCard.

        Delegates to ``DataCard.get_metadata_fields()`` which handles
        both embedded metadata_fields and external metadata configs
        (via applies_to).

        :param repo_id: Repository ID
        :param config_name: Configuration name
        :return: List of metadata field names, or None if not found

        """
        try:
            card = self._datacards.get(repo_id) or _cached_datacard(
                repo_id, token=self.token
            )
            return card.get_metadata_fields(config_name)
        except Exception:
            # Best-effort: a missing or malformed DataCard must not abort
            # view construction, so log and signal "unknown" with None.
            logger.error(
                "Could not resolve metadata_fields for %s/%s",
                repo_id,
                config_name,
            )
            return None

    def _get_class_label_names(
        self, card: Any, config_name: str, field: str
    ) -> list[str]:
        """
        Return the ENUM levels for a field with class_label dtype.

        Looks up the FeatureInfo for ``field`` in the DataCard config and
        extracts the ``names`` list from its ``class_label`` dtype dict.

        :param card: DataCard instance
        :param config_name: Configuration name
        :param field: Field name to look up
        :return: List of level strings
        :raises ValueError: If the field is not found, has no class_label dtype,
            or the class_label dict has no ``names`` key

        """
        try:
            features = card.get_features(config_name)
        except Exception as exc:
            raise ValueError(
                f"Could not retrieve features for config '{config_name}': {exc}"
            ) from exc

        feature = next((f for f in features if f.name == field), None)
        if feature is None:
            raise ValueError(
                f"Field '{field}' not found in DataCard config '{config_name}'. "
                "dtype='factor' requires the field to be declared in the DataCard."
            )

        dtype = feature.dtype
        if not isinstance(dtype, dict) or "class_label" not in dtype:
            raise ValueError(
                f"dtype='factor' is set for field '{field}' in config "
                f"'{config_name}', but the DataCard dtype is {dtype!r} rather "
                "than a class_label dict. "
                "The DataCard must declare dtype: {class_label: {names: [...]}}."
            )

        class_label = dtype["class_label"]
        names = class_label.get("names") if isinstance(class_label, dict) else None
        if not names:
            raise ValueError(
                f"class_label for field '{field}' in config '{config_name}' "
                "has no 'names' key or the names list is empty. "
                "Specify levels as: class_label: {names: [level1, level2, ...]}."
            )

        # Stringify each level: ENUM levels are always VARCHAR in DuckDB.
        return [str(n) for n in names]

    def _ensure_enum_type(self, type_name: str, levels: list[str]) -> None:
        """
        Create or replace a DuckDB ENUM type with the given levels.

        DuckDB ENUM types must be registered before use in CAST expressions. Drops any
        existing type with the same name first to allow re-registration on repeated view
        builds.

        :param type_name: SQL identifier for the ENUM type
        :param levels: Ordered list of allowed string values

        """
        try:
            self._conn.execute(f"DROP TYPE IF EXISTS {type_name}")
        except Exception:
            pass  # type may not exist yet
        # chr(39) is a single quote; doubling it is the SQL escape for a
        # quote inside a quoted literal.
        escaped = ", ".join(f"'{v.replace(chr(39), chr(39)*2)}'" for v in levels)
        self._conn.execute(f"CREATE TYPE {type_name} AS ENUM ({escaped})")

    def _resolve_alias(self, col: str, value: str) -> str:
        """
        Apply factor alias to a value if one is configured.

        Matching against configured alias values is case-insensitive.

        :param col: Column name (e.g., "carbon_source")
        :param value: Raw value (e.g., "D-glucose")
        :return: Canonical alias (e.g., "glucose") or original value

        """
        aliases = self.config.factor_aliases.get(col)
        if not aliases:
            return value
        lower_val = str(value).lower()
        for canonical, actuals in aliases.items():
            if lower_val in [str(a).lower() for a in actuals]:
                return canonical
        return value

    def _resolve_property_columns(
        self,
        repo_id: str,
        config_name: str,
    ) -> tuple[list[str], list[str]] | None:
        """
        Build SQL column expressions for derived property columns.

        Resolves config property mappings against the DataCard to
        produce SQL expressions that add derived columns to the
        ``_meta`` view.

        :param repo_id: Repository ID
        :param config_name: Configuration name
        :return: Tuple of (sql_expressions, raw_cols_needed) or None
            if no property mappings are configured.
            ``sql_expressions`` are SQL fragments like
            ``"'glucose' AS carbon_source"`` or
            ``"CASE WHEN ... END AS carbon_source"``.
            ``raw_cols_needed`` are raw parquet column names that must
            be present in the inner SELECT.

        """
        mappings = self.config.get_property_mappings(repo_id, config_name)
        if not mappings and not self.config.missing_value_labels:
            return None

        expressions: list[str] = []
        raw_cols: set[str] = set()

        card = None
        if mappings:
            try:
                card = self._datacards.get(repo_id) or _cached_datacard(
                    repo_id, token=self.token
                )
            except Exception as exc:
                logger.warning(
                    "Could not load DataCard for %s: %s",
                    repo_id,
                    exc,
                )

        # Four mapping types, dispatched in order:
        #   D: expression-only, A: field-only, B: field+path, C: path-only.
        for key, mapping in mappings.items():
            if card is None:
                # Cannot resolve field/path mappings without a DataCard;
                # skip this mapping and fall through to missing_value_labels.
                continue
            if mapping.expression is not None:
                # Type D: expression
                expressions.append(f"({mapping.expression}) AS {key}")
                continue

            if mapping.field is not None and mapping.path is None:
                # Type A: field-only (alias or ENUM cast)
                raw_cols.add(mapping.field)
                if mapping.dtype == "factor":
                    # Fetch class_label levels from DataCard, register ENUM,
                    # and emit a CAST expression. Raises ValueError if the
                    # DataCard does not declare a class_label dtype.
                    enum_type = f"_enum_{key}"
                    levels = self._get_class_label_names(
                        card, config_name, mapping.field
                    )
                    self._ensure_enum_type(enum_type, levels)
                    # If all levels are integer-valued strings (e.g. '0',
                    # '90'), the parquet column may be DOUBLE (e.g. 90.0).
                    # Cast through BIGINT first to strip the decimal before
                    # converting to VARCHAR so '90.0' becomes '90'.
                    all_int = all(re.fullmatch(r"-?\d+", lv) for lv in levels)
                    inner = (
                        f"CAST({mapping.field} AS BIGINT)" if all_int else mapping.field
                    )
                    expressions.append(
                        f"CAST(CAST({inner} AS VARCHAR)" f" AS {enum_type}) AS {key}"
                    )
                elif key == mapping.field:
                    # no-op -- column already present as raw col
                    pass
                else:
                    expressions.append(f"{mapping.field} AS {key}")
                continue

            if mapping.field is not None and mapping.path is not None:
                # Type B: field + path -- resolve from definitions.
                # dtype='factor' is not supported here: levels come from a
                # class_label field, not a definitions path.
                if mapping.dtype == "factor":
                    raise ValueError(
                        f"dtype='factor' is not supported for field+path mappings "
                        f"(key='{key}'). Use dtype='factor' only with field-only "
                        "mappings that reference a class_label field in the DataCard."
                    )
                raw_cols.add(mapping.field)
                expr = self._build_field_path_expr(
                    key,
                    mapping.field,
                    mapping.path,
                    mapping.dtype,
                    config_name,
                    card,
                )
                if expr is not None:
                    expressions.append(expr)
                continue

            if mapping.field is None and mapping.path is not None:
                # Type C: path-only -- constant from config
                expr = self._build_path_only_expr(
                    key,
                    mapping.path,
                    mapping.dtype,
                    config_name,
                    card,
                )
                if expr is not None:
                    expressions.append(expr)
                continue

        # For any key in missing_value_labels that was not covered by an
        # explicit mapping for this dataset, emit a constant literal so that
        # every _meta view exposes the column (with the fallback value).
        for key, label in self.config.missing_value_labels.items():
            if key not in mappings:
                escaped = label.replace("'", "''")
                expressions.append(f"'{escaped}' AS {key}")

        if not expressions and not raw_cols:
            return None

        return expressions, sorted(raw_cols)

    def _build_field_path_expr(
        self,
        key: str,
        field: str,
        path: str,
        dtype: str | None,
        config_name: str,
        card: Any,
    ) -> str | None:
        """
        Build a SQL expression for a field+path property mapping.

        Resolves each definition value via ``get_nested_value``,
        applies factor aliases, and returns either a constant or
        a CASE WHEN expression.

        :param key: Output column name
        :param field: Source field in parquet (e.g., "condition")
        :param path: Dot-notation path within definitions
        :param dtype: Optional data type ("numeric", "string", "bool")
        :param config_name: Configuration name
        :param card: DataCard instance
        :return: SQL expression string, or None on failure

        """
        try:
            defs = card.get_field_definitions(config_name, field)
        except Exception as exc:
            logger.warning(
                "Could not get definitions for field '%s' " "in config '%s': %s",
                field,
                config_name,
                exc,
            )
            return None

        if not defs:
            return None

        # Resolve each definition value
        value_map: dict[str, str] = {}
        for def_key, definition in defs.items():
            raw = get_nested_value(definition, path)
            if raw is None:
                logger.debug(
                    "Path '%s' resolved to None for " "definition key '%s' (keys: %s)",
                    path,
                    def_key,
                    (
                        list(definition.keys())
                        if isinstance(definition, dict)
                        else type(definition).__name__
                    ),
                )
                continue
            # Handle list results (e.g., carbon_source returns
            # [{"compound": "D-glucose"}]): unwrap singletons, join the rest.
            if isinstance(raw, list):
                raw = raw[0] if len(raw) == 1 else ", ".join(str(v) for v in raw)
            resolved = self._resolve_alias(key, str(raw))
            value_map[str(def_key)] = resolved

        if not value_map:
            return None

        # If all values are the same, emit a constant
        unique_vals = set(value_map.values())
        if len(unique_vals) == 1:
            val = next(iter(unique_vals))
            return self._literal_expr(key, val, dtype)

        # Otherwise, build CASE WHEN
        whens = []
        for def_key, resolved in value_map.items():
            escaped_key = def_key.replace("'", "''")
            escaped_val = resolved.replace("'", "''")
            whens.append(f"WHEN {field} = '{escaped_key}' " f"THEN '{escaped_val}'")
        case_sql = " ".join(whens)
        # Fall back to the configured missing-value label (or NULL) for
        # rows whose field value has no definition entry.
        missing = self.config.missing_value_labels.get(key)
        if missing is not None:
            escaped_missing = missing.replace("'", "''")
            expr = f"CASE {case_sql} " f"ELSE '{escaped_missing}' END"
        else:
            expr = f"CASE {case_sql} ELSE NULL END"
        if dtype == "numeric":
            expr = f"CAST({expr} AS DOUBLE)"
        return f"{expr} AS {key}"

    def _build_path_only_expr(
        self,
        key: str,
        path: str,
        dtype: str | None,
        config_name: str,
        card: Any,
    ) -> str | None:
        """
        Build a constant column expression for a path-only mapping.

        Resolves a single value from the DataCard's raw model_extra,
        which preserves the full dict structure (including any
        ``experimental_conditions`` wrapper).

        :param key: Output column name
        :param path: Dot-notation path (may include
            ``experimental_conditions.`` prefix)
        :param dtype: Optional data type
        :param config_name: Configuration name
        :param card: DataCard instance
        :return: SQL literal expression, or None on failure

        """
        # Build merged dict from top-level + config-level model_extra.
        # This preserves keys like "experimental_conditions" that
        # get_experimental_conditions() would strip.
+ merged: dict[str, Any] = {} + try: + top_extra = card.dataset_card.model_extra + if isinstance(top_extra, dict): + merged.update(top_extra) + config_obj = card.get_config(config_name) + if config_obj and isinstance(config_obj.model_extra, dict): + merged.update(config_obj.model_extra) + except Exception: + logger.debug( + "Could not get model_extra for %s/%s", + card.repo_id if hasattr(card, "repo_id") else "?", + config_name, + ) + return None + + if not merged: + return None + + raw = get_nested_value(merged, path) + if raw is None: + logger.debug( + "Path '%s' resolved to None in model_extra for " + "%s/%s. Available keys: %s", + path, + card.repo_id if hasattr(card, "repo_id") else "?", + config_name, + list(merged.keys()), + ) + return None + + if isinstance(raw, list): + raw = raw[0] if len(raw) == 1 else ", ".join(str(v) for v in raw) + + resolved = self._resolve_alias(key, str(raw)) + return self._literal_expr(key, resolved, dtype) + + @staticmethod + def _literal_expr(key: str, value: str, dtype: str | None) -> str: + """ + Build a SQL literal expression with optional type cast. + + :param key: Column alias + :param value: Literal value + :param dtype: Optional type ("numeric", "string", "bool") + :return: SQL expression + + """ + escaped = value.replace("'", "''") + if dtype == "numeric": + return f"CAST('{escaped}' AS DOUBLE) AS {key}" + return f"'{escaped}' AS {key}" + + def _register_comparative_expanded_view( + self, + db_name: str, + ds_cfg: Any, + ) -> None: + """ + Create ``_expanded`` view with parsed composite ID cols. + + For each link_field in the dataset config, adds two columns: + + - ``_source`` -- the ``repo_id;config_name`` prefix, + aliased to the configured ``db_name`` when available. + - ``_id`` -- the sample identifier component. 
+ + :param db_name: Base view name for the comparative dataset + :param ds_cfg: DatasetVirtualDBConfig with ``links`` + + """ + parquet_view = f"__{db_name}_parquet" + if not self._view_exists(parquet_view): + return + + extra_cols = [] + for link_field, primaries in ds_cfg.links.items(): + # _id column: third component of composite ID + id_col = f"{link_field}_id" + extra_cols.append(f"SPLIT_PART({link_field}, ';', 3) " f"AS {id_col}") + + # _source column: first two components, aliased + # to db_name when the pair is in the config + raw_expr = ( + f"SPLIT_PART({link_field}, ';', 1) || ';' " + f"|| SPLIT_PART({link_field}, ';', 2)" + ) + whens = [] + for pair in primaries: + repo_id, config_name = pair[0], pair[1] + alias = self._get_db_name_for(repo_id, config_name) + if alias: + key = f"{repo_id};{config_name}".replace("'", "''") + whens.append(f"WHEN '{key}' THEN '{alias}'") + if whens: + case_sql = " ".join(whens) + source_expr = f"CASE {raw_expr} {case_sql} " f"ELSE {raw_expr} END" + else: + source_expr = raw_expr + source_col = f"{link_field}_source" + extra_cols.append(f"{source_expr} AS {source_col}") + + if not extra_cols: + return + + cols_sql = ", ".join(extra_cols) + self._conn.execute( + f"CREATE OR REPLACE VIEW {db_name}_expanded AS " + f"SELECT *, {cols_sql} FROM {parquet_view}" + ) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _is_comparative(self, repo_id: str, config_name: str) -> bool: + """Return True if the dataset has links (i.e. 
is comparative).""" + repo_cfg = self.config.repositories.get(repo_id) + if not repo_cfg or not repo_cfg.dataset: + return False + ds_cfg = repo_cfg.dataset.get(config_name) + return bool(ds_cfg and ds_cfg.links) + + def _list_views(self) -> list[str]: + """Return list of public views (excludes internal __ prefixed).""" + df = self._conn.execute( + "SELECT table_name FROM information_schema.tables " + "WHERE table_schema = 'main' AND table_type = 'VIEW'" + ).fetchdf() + return [n for n in df["table_name"].tolist() if not n.startswith("__")] + + def _view_exists(self, name: str) -> bool: + """Check whether a view is registered (including internal).""" + df = self._conn.execute( + "SELECT table_name FROM information_schema.tables " + "WHERE table_schema = 'main' AND table_type = 'VIEW' " + f"AND table_name = '{name}'" + ).fetchdf() + return len(df) > 0 + + def _get_primary_view_names(self) -> list[str]: + """ + Return db_names of primary (non-comparative) raw views. + + A primary dataset is one whose config has no ``links``. 
+ + """ + names = [] + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if not self._is_comparative(repo_id, config_name): + if self._view_exists(db_name): + names.append(db_name) + return sorted(names) + + def _get_primary_meta_view_names(self) -> list[str]: + """Return names of primary ``_meta`` views.""" + return [ + f"{n}_meta" + for n in self._get_primary_view_names() + if self._view_exists(f"{n}_meta") + ] + + def _get_db_name_for(self, repo_id: str, config_name: str) -> str | None: + """Resolve db_name for a (repo_id, config_name) pair.""" + for db_name, (r, c) in self._db_name_map.items(): + if r == repo_id and c == config_name: + return db_name + return None + + def __repr__(self) -> str: + """String representation.""" + n_repos = len(self.config.repositories) + n_datasets = len(self._db_name_map) + n_views = len(self._list_views()) + return ( + f"VirtualDB({n_repos} repos, " + f"{n_datasets} datasets, " + f"{n_views} views)" + )