openproblems-bio · rcannood · May 14, 2026 · May 12, 2026 · May 14, 2026
diff --git a/src/data_processors/cell_type_annotation_tacco/config.vsh.yaml b/src/data_processors/cell_type_annotation_tacco/config.vsh.yaml
@@ -0,0 +1,82 @@
+name: cell_type_annotation_tacco  # component name; listed as a dependency in the run_benchmark workflow config
+namespace: data_processors
+label: "TACCO"
+summary: "Assign cell types to segmented cells via optimal transport against a scRNA-seq reference."
+description: |
+  TACCO (Transfer of Annotations to Cells and their Compartments) takes a
+  processed segmentation prediction (cells with per-cell gene expression) and
+  assigns a cell type to each cell by comparing its expression profile to a
+  scRNA-seq reference via optimal transport. It runs as a post-processing step
+  after any segmentation method and adds a cell_type column to the cell table.
+links:
+  documentation: "https://simonwm.github.io/tacco/"
+  repository: "https://github.com/simonwm/tacco"
+references:
+  doi: "10.1038/s41587-023-01657-3"
+
+
+arguments:
+  - name: --input_processed_prediction  # opened in script.py with spatialdata.read_zarr
+    __merge__: /src/api/file_processed_prediction.yaml  # file spec is merged in from the shared API definition
+    direction: input
+    required: true
+
+  - name: --input_scrnaseq_reference  # opened in script.py with anndata.read_h5ad
+    __merge__: /src/api/file_scrnaseq_reference.yaml  # file spec is merged in from the shared API definition
+    direction: input
+    required: true
+
+  - name: --output
+    type: file
+    label: "Cell type annotation"
+    summary: "AnnData with predicted cell type labels in obs."
+    description: "An h5ad file containing obs with cell_type and cell_id columns, plus uns metadata (dataset_id, method_id)."
+    direction: output
+    required: true
+    default: output.h5ad
+    example: output.h5ad
+    info:
+      format:
+        type: h5ad
+        obs:  # the two columns script.py writes into the output's obs
+          - type: string
+            name: cell_type
+            description: Predicted cell type label
+            required: true
+          - type: string
+            name: cell_id
+            description: Cell ID matching the segmentation table
+            required: true
+        uns:  # both entries are copied by script.py from the input table's uns
+          - type: string
+            name: dataset_id
+            required: true
+          - type: string
+            name: method_id  # value comes from the input prediction table, not from this component's name
+            required: true
+
+test_resources:
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py  # shared test script from /common
+  - path: /resources_test/task_spatial_segmentation/mouse_brain_combined  # same dataset the script.py development defaults point at
+    dest: resources_test/task_spatial_segmentation/mouse_brain_combined
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi: [tacco]  # provides the tacco module imported by script.py
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml  # NOTE(review): presumably installs spatialdata, which script.py imports — confirm
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midcpu, highmem]  # labels passed to the Nextflow runner as directives
diff --git a/src/data_processors/cell_type_annotation_tacco/script.py b/src/data_processors/cell_type_annotation_tacco/script.py
@@ -0,0 +1,58 @@
+import anndata as ad
+import numpy as np
+import spatialdata as sd
+import tacco
+
+## VIASH START
+par = {
+    'input_processed_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr',
+    'input_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad',
+    'output': 'output.h5ad',
+}
+meta = {
+    'name': 'tacco',
+}
+## VIASH END
+
+print('Reading inputs', flush=True)
+sdata_pred = sd.read_zarr(par['input_processed_prediction'])
+adata_sc = ad.read_h5ad(par['input_scrnaseq_reference'])
+
+table = sdata_pred.tables['table']
+
+if table.n_obs == 0:
+    print('No cells detected in prediction — skipping annotation', flush=True)
+    cell_types = []
+else:
+    # remap Ensembl IDs to gene symbols in-place if needed
+    if 'feature_name' in adata_sc.var.columns:
+        adata_sc.var_names = adata_sc.var['feature_name'].values
+        adata_sc = adata_sc[:, ~adata_sc.var_names.duplicated()].copy()
+
+    if 'counts' not in adata_sc.layers:
+        raise ValueError("scRNA-seq reference is missing the 'counts' layer.")
+
+    common_genes = sorted(set(table.var_names) & set(adata_sc.var_names))
+    if len(common_genes) == 0:
+        raise ValueError('No common genes between prediction cells and scRNA-seq reference.')
+    print(f'Using {len(common_genes)} common genes', flush=True)
+
+    adata_sp_sub = table[:, common_genes].copy()
+    adata_sp_sub.X = adata_sp_sub.layers['counts']
+    adata_sc_sub = adata_sc[:, common_genes].copy()
+    adata_sc_sub.X = adata_sc_sub.layers['counts']
+
+    print('Running TACCO annotation', flush=True)
+    cell_type_annotation = tacco.tl.annotate(
+        adata=adata_sp_sub,
+        reference=adata_sc_sub,
+        annotation_key='cell_type',
+    )
+    best_type_idx = np.argmax(cell_type_annotation.values, axis=1)
+    cell_types = cell_type_annotation.columns[best_type_idx].tolist()
+
+print('Writing output', flush=True)
+output = ad.AnnData(obs={'cell_type': cell_types, 'cell_id': table.obs['cell_id'].values})
+output.uns['dataset_id'] = table.uns['dataset_id']
+output.uns['method_id'] = table.uns['method_id']
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
@@ -65,6 +65,7 @@ dependencies:
   - name: control_methods/empty_labels
   - name: control_methods/random_voronoi
   - name: methods/cellpose
+  - name: data_processors/cell_type_annotation_tacco
   - name: metrics/ari
   - name: data_processors/process_prediction
 

diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
@@ -95,6 +95,19 @@ workflow run_wf {
       }
     )
 
+    // annotate segmented cells with cell types (TACCO, against the scRNA-seq reference)
+    | cell_type_annotation_tacco.run(
+      fromState: [
+        input_processed_prediction: "input_prediction", // NOTE(review): confirm this state key holds the *processed* prediction the component reads
+        input_scrnaseq_reference: "input_scrnaseq_reference"
+      ],
+      toState: { id, output, state ->
+        state + [
+          cell_type_annotation: output.output // component's --output h5ad, added to the state under a new key
+        ]
+      }
+    )
+
     // run all metrics
     | runEach(
       components: metrics,