diff --git a/src/data_processors/cell_type_annotation_tacco/config.vsh.yaml b/src/data_processors/cell_type_annotation_tacco/config.vsh.yaml
new file mode 100644
index 0000000..695b4a8
--- /dev/null
+++ b/src/data_processors/cell_type_annotation_tacco/config.vsh.yaml
@@ -0,0 +1,82 @@
+name: cell_type_annotation_tacco
+namespace: data_processors
+label: "TACCO"
+summary: "Assign cell types to segmented cells via optimal transport against a scRNA-seq reference."
+description: |
+  TACCO (Transfer of Annotations to Cells and their Compartments) takes a
+  processed segmentation prediction (cells with per-cell gene expression) and
+  assigns a cell type to each cell by comparing its expression profile to a
+  scRNA-seq reference via optimal transport. It runs as a post-processing step
+  after any segmentation method and writes cell_type per cell_id to an h5ad file.
+links:
+  documentation: "https://simonwm.github.io/tacco/"
+  repository: "https://github.com/simonwm/tacco"
+references:
+  doi: "10.1038/s41587-023-01657-3"
+
+
+arguments:
+  - name: --input_processed_prediction
+    __merge__: /src/api/file_processed_prediction.yaml
+    direction: input
+    required: true
+
+  - name: --input_scrnaseq_reference
+    __merge__: /src/api/file_scrnaseq_reference.yaml
+    direction: input
+    required: true
+
+  - name: --output
+    type: file
+    label: "Cell type annotation"
+    summary: "AnnData with predicted cell type labels in obs."
+    description: "An h5ad file containing obs with cell_type and cell_id columns, plus uns metadata (dataset_id, method_id)."
+    direction: output
+    required: true
+    default: output.h5ad
+    example: output.h5ad
+    info:
+      format:
+        type: h5ad
+        obs:
+          - type: string
+            name: cell_type
+            description: Predicted cell type label
+            required: true
+          - type: string
+            name: cell_id
+            description: Cell ID matching the segmentation table
+            required: true
+        uns:
+          - type: string
+            name: dataset_id
+            required: true
+          - type: string
+            name: method_id
+            required: true
+
+test_resources:
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
+  - path: /resources_test/task_spatial_segmentation/mouse_brain_combined
+    dest: resources_test/task_spatial_segmentation/mouse_brain_combined
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi: [tacco]
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midcpu, highmem]
diff --git a/src/data_processors/cell_type_annotation_tacco/script.py b/src/data_processors/cell_type_annotation_tacco/script.py
new file mode 100644
index 0000000..b2d8d0b
--- /dev/null
+++ b/src/data_processors/cell_type_annotation_tacco/script.py
@@ -0,0 +1,76 @@
+import anndata as ad
+import numpy as np
+import spatialdata as sd
+import tacco
+
+## VIASH START
+par = {
+    'input_processed_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr',
+    'input_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad',
+    'output': 'output.h5ad',
+}
+meta = {
+    'name': 'cell_type_annotation_tacco',
+}
+## VIASH END
+
+# Assign one cell type per segmented cell by transferring labels from the
+# scRNA-seq reference with TACCO, then write cell_type/cell_id to an h5ad file.
+
+print('Reading inputs', flush=True)
+sdata_pred = sd.read_zarr(par['input_processed_prediction'])
+adata_sc = ad.read_h5ad(par['input_scrnaseq_reference'])
+
+table = sdata_pred.tables['table']
+
+if table.n_obs == 0:
+    # Nothing to annotate; a zero-row output is still written below.
+    print('No cells detected in prediction — skipping annotation', flush=True)
+    cell_types = []
+else:
+    # Remap reference var_names (Ensembl IDs) to the gene symbols stored in
+    # var['feature_name'] when present, keeping the first of any repeated name.
+    if 'feature_name' in adata_sc.var.columns:
+        adata_sc.var_names = adata_sc.var['feature_name'].values
+        adata_sc = adata_sc[:, ~adata_sc.var_names.duplicated()].copy()
+
+    # Both inputs are read from their 'counts' layer below, so fail early with
+    # a clear message instead of a bare KeyError.
+    if 'counts' not in adata_sc.layers:
+        raise ValueError("scRNA-seq reference is missing the 'counts' layer.")
+    if 'counts' not in table.layers:
+        raise ValueError("Processed prediction table is missing the 'counts' layer.")
+
+    common_genes = sorted(set(table.var_names) & set(adata_sc.var_names))
+    if not common_genes:
+        raise ValueError('No common genes between prediction cells and scRNA-seq reference.')
+    print(f'Using {len(common_genes)} common genes', flush=True)
+
+    # Restrict both datasets to the shared genes and expose raw counts as X.
+    adata_sp_sub = table[:, common_genes].copy()
+    adata_sp_sub.X = adata_sp_sub.layers['counts']
+    adata_sc_sub = adata_sc[:, common_genes].copy()
+    adata_sc_sub.X = adata_sc_sub.layers['counts']
+
+    print('Running TACCO annotation', flush=True)
+    cell_type_annotation = tacco.tl.annotate(
+        adata=adata_sp_sub,
+        reference=adata_sc_sub,
+        annotation_key='cell_type',
+    )
+    # The result is used as a cells x cell-types score table whose labels are
+    # paired positionally with table.obs['cell_id'] below.
+    if cell_type_annotation.shape[0] != table.n_obs:
+        raise ValueError(
+            f'TACCO returned {cell_type_annotation.shape[0]} rows for {table.n_obs} cells.'
+        )
+    # Pick the highest-scoring cell type for every cell.
+    best_type_idx = np.argmax(cell_type_annotation.values, axis=1)
+    cell_types = cell_type_annotation.columns[best_type_idx].tolist()
+
+print('Writing output', flush=True)
+output = ad.AnnData(obs={'cell_type': cell_types, 'cell_id': table.obs['cell_id'].values})
+# Carry the dataset/method identifiers over from the input table.
+output.uns['dataset_id'] = table.uns['dataset_id']
+output.uns['method_id'] = table.uns['method_id']
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 67bf53c..10c513a 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -65,6 +65,7 @@ dependencies:
   - name: control_methods/empty_labels
   - name: control_methods/random_voronoi
   - name: methods/cellpose
+  - name: data_processors/cell_type_annotation_tacco
   - name: metrics/ari
   - name: data_processors/process_prediction
 
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index a61478f..75043f2 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -95,6 +95,22 @@ workflow run_wf {
       }
     )
 
+    // annotate segmented cells with cell types
+    // NOTE(review): the component reads tables['table'] with a 'counts' layer
+    // from this input — confirm "input_prediction" is the state key holding the
+    // processed prediction.
+    | cell_type_annotation_tacco.run(
+      fromState: [
+        input_processed_prediction: "input_prediction",
+        input_scrnaseq_reference: "input_scrnaseq_reference"
+      ],
+      toState: { id, output, state ->
+        state + [
+          cell_type_annotation: output.output
+        ]
+      }
+    )
+
     // run all metrics
     | runEach(
       components: metrics,