From 75d6443d5f676a1d15d3685e223b80edcee238c9 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Thu, 14 May 2026 13:33:25 +0200 Subject: [PATCH 1/6] Add CLAUDE.md with codebase guidance for Claude Code Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..e71049f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,79 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Framework: Viash + Nextflow + +This is a [Viash](https://viash.io)-based benchmarking task for spatial transcriptomics cell segmentation, part of the [OpenProblems](https://openproblems.bio) initiative. Every executable is a **Viash component** — a self-contained unit pairing a `config.vsh.yaml` (metadata, arguments, Docker setup) with a `script.py` or `script.R`. Nextflow workflows orchestrate multi-component pipelines. + +## Common Commands + +```bash +# Sync test resources from S3 (required before testing) +scripts/sync_resources.sh + +# Build all components (Docker-cached) +viash ns build --parallel --setup cachedbuild + +# Test a single component +viash test src/methods/cellpose/config.vsh.yaml + +# Test all components in parallel +viash ns test --parallel + +# Run benchmark locally (test scale) +scripts/run_benchmark/run_test_local.sh + +# Run full local benchmark +scripts/run_benchmark/run_full_local.sh + +# Create a new method or metric from template +common/scripts/create_component +``` + +## Architecture + +The task follows a fixed pipeline defined by the API specs in `src/api/`: + +``` +Raw spatial data + → [data_processor] process_dataset → unlabelled + solution files + → [method / control_method] → prediction file + → [data_processor] process_prediction → formatted prediction + → [metric] ari → score file +``` + +**Component types** (each defined in `src/api/comp_*.yaml`): +- `control_method` — baseline segmentations (random Voronoi, true labels, empty labels) +- `method` — segmentation algorithms under evaluation (currently: Cellpose) +- `data_processor` — preprocessing and postprocessing steps +- `metric` — evaluation metrics (currently: ARI) +- `workflow` — Nextflow pipelines in `src/workflows/` + +**File format contracts** are defined in `src/api/file_*.yaml`. All component I/O must conform to these schemas. The `common/component_tests/run_and_check_output.py` helper enforces them during tests. + +## Component Structure + +Each component under `src/` follows this pattern: +``` +src/// +├── config.vsh.yaml # arguments, Docker image, test resources, test defaults +└── script.py # or script.R +``` + +Test resources and default parameters for `viash test` are declared inside `config.vsh.yaml` under `info.test_resources` and `info.test_default`. + +## Data Formats + +- **Spatial data**: Zarr-based `SpatialData` objects (`.zarr/`) +- **Single-cell data**: AnnData HDF5 files (`.h5ad`) +- Additional sources of ground truth will include spatial proteomics or annotated histology +- Test datasets live in `resources_test/` (downloaded via `scripts/sync_resources.sh` from S3) + +## Important Notes + +- The `common/` directory is a **git submodule** — clone with `git clone --recursive` or run `git submodule update --init` after cloning. +- `README.md` is **auto-generated** from the YAML API specs; do not edit it directly. +- All components run inside Docker containers by default. Use `--platform docker` / `--engine docker` flags with Viash when needed. +- `_viash.yaml` is the project-level Viash config (project name, organization, package registry, test resource S3 paths). +- Don't commit to main, always create a new branch From 1879d81d66ca2796233d510dca997f306009f54c Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Fri, 15 May 2026 00:07:30 +0200 Subject: [PATCH 2/6] add instructions on summary --- CLAUDE.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index e71049f..eefc529 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,6 +15,9 @@ scripts/sync_resources.sh # Build all components (Docker-cached) viash ns build --parallel --setup cachedbuild +# Build all Docker containers (run before local benchmark) +scripts/project/build_all_docker_containers.sh + # Test a single component viash test src/methods/cellpose/config.vsh.yaml @@ -29,6 +32,16 @@ scripts/run_benchmark/run_full_local.sh # Create a new method or metric from template common/scripts/create_component + +# Inspect the Docker image ID and Dockerfile for a built component +target/executable// ---docker_image_id +target/executable// ---dockerfile + +# Run a single built component directly +target/executable// --input --output + +# Run a built component as a Nextflow workflow +nextflow run target/nextflow/ -profile docker --id --input --publish_dir out/ ``` ## Architecture @@ -77,3 +90,4 @@ Test resources and default parameters for `viash test` are declared inside `conf - All components run inside Docker containers by default. Use `--platform docker` / `--engine docker` flags with Viash when needed. - `_viash.yaml` is the project-level Viash config (project name, organization, package registry, test resource S3 paths). - Don't commit to main, always create a new branch +- Fill in the summary for a src/methods//config.vsh.yaml. Write a one sentence summary and a one paragraph summary of how this method works based on documentation and references. \ No newline at end of file From 45e6521b95e1d39206c35e4cd97ffd51737dd1cd Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Sat, 16 May 2026 22:00:07 +0200 Subject: [PATCH 3/6] add test instructions --- CLAUDE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index eefc529..de122ca 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -90,4 +90,5 @@ Test resources and default parameters for `viash test` are declared inside `conf - All components run inside Docker containers by default. Use `--platform docker` / `--engine docker` flags with Viash when needed. - `_viash.yaml` is the project-level Viash config (project name, organization, package registry, test resource S3 paths). - Don't commit to main, always create a new branch -- Fill in the summary for a src/methods//config.vsh.yaml. Write a one sentence summary and a one paragraph summary of how this method works based on documentation and references. \ No newline at end of file +- Fill in the summary for a src/methods//config.vsh.yaml. Write a one sentence summary and a one paragraph summary of how this method works based on documentation and references. +- always use viash test to test new components \ No newline at end of file From 1e0843a7ecfdc4421962cd4b2b62f6bbfe2f9b2b Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Sun, 17 May 2026 08:07:08 +0200 Subject: [PATCH 4/6] add method implementation --- src/methods/proseg/config.vsh.yaml | 91 +++++++++++++++ src/methods/proseg/script.py | 164 +++++++++++++++++++++++++++ src/methods/proseg/script_no_sopa.py | 145 +++++++++++++++++++++++ src/methods/stardist/config.vsh.yaml | 54 +++++++++ src/methods/stardist/script.py | 120 ++++++++++++++++++++ 5 files changed, 574 insertions(+) create mode 100644 src/methods/proseg/config.vsh.yaml create mode 100644 src/methods/proseg/script.py create mode 100644 src/methods/proseg/script_no_sopa.py create mode 100644 src/methods/stardist/config.vsh.yaml create mode 100644 src/methods/stardist/script.py diff --git a/src/methods/proseg/config.vsh.yaml b/src/methods/proseg/config.vsh.yaml new file mode 100644 index 0000000..420a97c --- /dev/null +++ b/src/methods/proseg/config.vsh.yaml @@ -0,0 +1,91 @@ +__merge__: /src/api/comp_method.yaml + +name: proseg +label: "Proseg" + +# summary and description are generated by claude sonnet 4.6 based on the paper, github repo and documentation +summary: Infers cell boundaries from transcript spatial distributions using a Cellular Potts model–inspired probabilistic framework. +description: | + Proseg assigns a probability to every configuration of cell labels on a voxel lattice, driven by a + spatial transcript count model that favors transcripts being contained within a single cell, a cell + compactness prior, and optional diffusion terms to account for transcripts that leak outside their + true cell of origin. The model is fit using a Markov chain Monte Carlo sampler that alternates between + sampling cell assignments for each voxel and sampling cell-specific expression parameters. A burn-in + phase at a coarser voxel resolution speeds up initialization before the final-resolution sampling + begins. Because the method is purely transcript-driven it does not require a morphology image, though + an optional nuclear-segmentation prior can be incorporated. Cell boundaries are reported as polygon + shapes and are rasterized to a label image for downstream analysis. + +links: + documentation: "https://github.com/dcjones/proseg" + repository: "https://github.com/dcjones/proseg" +references: + doi: "10.1038/s41592-025-02697-0" + +arguments: + - name: --voxel_layers + type: integer + required: false + description: "Number of layers on the z-axis to model 3D cells." + default: 4 + info: + test_default: 1 + + - name: --samples + type: integer + required: false + description: "Run the sampler for this many iterations." + default: 200 + info: + test_default: 10 + + - name: --burnin_samples + type: integer + required: false + description: "Run the sampler for a preliminary N samples at a lower resolution." + default: 200 + info: + test_default: 10 + + - name: --voxel_size + type: double + required: false + description: "Voxel size in microns on the x/y axis." + default: 1.0 + info: + test_default: 2.0 + + - name: --burnin_voxel_size + type: double + required: false + description: "Larger voxel size for the burn-in phase. Must be an integer multiple of voxel_size." + default: 2.0 + info: + test_default: 4.0 + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: + - /src/base/setup_spatialdata_partial.yaml + setup: + - type: docker + env: + - PATH="/root/.cargo/bin:${PATH}" + run: + - curl https://sh.rustup.rs -sSf | sh -s -- -y + - echo 'source $HOME/.cargo/env' >> $HOME/.bashrc + - cargo install proseg + - type: python + pypi: [rasterio] + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [ hightime, highcpu, midmem ] diff --git a/src/methods/proseg/script.py b/src/methods/proseg/script.py new file mode 100644 index 0000000..6c1d678 --- /dev/null +++ b/src/methods/proseg/script.py @@ -0,0 +1,164 @@ +import os +import shutil +import subprocess +from pathlib import Path +import numpy as np +import pandas as pd +import xarray as xr +import anndata as ad +import spatialdata as sd +from shapely.affinity import affine_transform as shapely_affine_transform +from spatialdata.models import Labels2DModel +from spatialdata.transformations import get_transformation +from rasterio.features import rasterize as rio_rasterize + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': r'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'output': r'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr', + 'voxel_layers': int(r'4'), + 'samples': int(r'200'), + 'burnin_samples': int(r'200'), + 'voxel_size': float(r'1.0'), + 'burnin_voxel_size': float(r'2.0') +} +meta = { + 'name': r'proseg', + 'functionality_name': r'proseg', + 'resources_dir': r'/private/tmp/viash_inject_proseg11113472565944867086', + 'executable': r'/private/tmp/viash_inject_proseg11113472565944867086/proseg', + 'config': r'/private/tmp/viash_inject_proseg11113472565944867086/.config.vsh.yaml', + 'temp_dir': r'/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/', + 'cpus': int(r'123'), + 'memory_b': int(r'123'), + 'memory_kb': int(r'123'), + 'memory_mb': int(r'123'), + 'memory_gb': int(r'123'), + 'memory_tb': int(r'123'), + 'memory_pb': int(r'123'), + 'memory_kib': int(r'123'), + 'memory_mib': int(r'123'), + 'memory_gib': int(r'123'), + 'memory_tib': int(r'123'), + 'memory_pib': int(r'123') +} +dep = { + +} + +## VIASH END + +proseg_dir = Path(meta['temp_dir'] or '/tmp') / 'proseg' +proseg_dir.mkdir(parents=True, exist_ok=True) + +print('Reading input', flush=True) +sdata = sd.read_zarr(par['input']) +image = sdata['morphology_mip']['scale0'].image.compute().to_numpy() +transformation = sdata['morphology_mip']['scale0'].image.transform.copy() +h, w = image.shape[-2:] + +print('Exporting transcripts to CSV', flush=True) +transcripts_df = sdata['transcripts'].compute() +has_z = 'z' in transcripts_df.columns and transcripts_df['z'].notna().any() + +# Proseg requires a prior cell-ID column. Build a simple grid-based prior so +# proseg can refine the segmentation from a meaningful starting state. +x_min, x_max = float(transcripts_df['x'].min()), float(transcripts_df['x'].max()) +y_min, y_max = float(transcripts_df['y'].min()), float(transcripts_df['y'].max()) +grid_size = 20.0 # microns per grid cell — roughly one cell diameter +x_bins = np.arange(x_min, x_max + grid_size, grid_size) +y_bins = np.arange(y_min, y_max + grid_size, grid_size) +x_idx = np.digitize(transcripts_df['x'].to_numpy(), x_bins) - 1 +y_idx = np.digitize(transcripts_df['y'].to_numpy(), y_bins) - 1 +n_x = max(len(x_bins) - 1, 1) +prior_cell_ids = (y_idx * n_x + x_idx) + 1 # 1-indexed; 0 is "unassigned" + +z_vals = transcripts_df['z'] if has_z else np.zeros(len(transcripts_df)) +csv_data = { + 'x': transcripts_df['x'], + 'y': transcripts_df['y'], + 'z': z_vals, + 'gene': transcripts_df['feature_name'], + 'cell_id': prior_cell_ids, +} +transcript_csv = proseg_dir / 'transcripts.csv' +pd.DataFrame(csv_data).to_csv(transcript_csv, index=False) + +n_threads = max((meta.get('cpus') or os.cpu_count() or 1) - 2, 1) +cmd = [ + 'proseg', 'transcripts.csv', + '--x-column', 'x', + '--y-column', 'y', + '--z-column', 'z', + '--gene-column', 'gene', + '--cell-id-column', 'cell_id', + '--cell-id-unassigned', '0', + '--nthreads', str(n_threads), + '--ncomponents', '10', + '--diffusion-probability', '0.2', + '--diffusion-sigma-far', '4', + '--diffusion-sigma-near', '1', + '--nuclear-reassignment-prob', '0.2', + '--cell-compactness', '0.03', + '--voxel-layers', str(par['voxel_layers']), + '--samples', str(par['samples']), + '--burnin-samples', str(par['burnin_samples']), + '--recorded-samples', str(par['samples']), + '--voxel-size', str(par['voxel_size']), + '--burnin-voxel-size', str(par['burnin_voxel_size']), +] +print(f'Running Proseg: {" ".join(cmd)}', flush=True) +subprocess.run(cmd, cwd=str(proseg_dir), check=True) + +print('Reading Proseg Zarr output', flush=True) +proseg_sdata = sd.read_zarr(str(proseg_dir / 'proseg-output.zarr')) +shapes = proseg_sdata.shapes['cell_boundaries'] +print(f'Found {len(shapes)} cell boundaries', flush=True) + +# Proseg boundaries are in global (micron) coordinates. Convert to pixel space +# using the inverse of the morphology image's pixel-to-global transformation. +print('Converting boundaries to pixel space and rasterizing', flush=True) +img_transform = get_transformation(sdata['morphology_mip'], to_coordinate_system='global') +affine_mat = img_transform.to_affine_matrix(input_axes=('x', 'y'), output_axes=('x', 'y')) +inv_affine = np.linalg.inv(affine_mat) + +def global_to_pixel(geom): + a, b, xoff = inv_affine[0, 0], inv_affine[0, 1], inv_affine[0, 2] + d, e, yoff = inv_affine[1, 0], inv_affine[1, 1], inv_affine[1, 2] + return shapely_affine_transform(geom, [a, b, d, e, xoff, yoff]) + +pixel_geoms = shapes['geometry'].apply(global_to_pixel) +shape_geoms = list(zip(pixel_geoms, range(1, len(shapes) + 1))) +labels = rio_rasterize(shape_geoms, out_shape=(h, w), fill=0, dtype=np.int32) + +max_val = labels.max() +if max_val <= np.iinfo(np.uint8).max: + labels = labels.astype(np.uint8) +elif max_val <= np.iinfo(np.uint16).max: + labels = labels.astype(np.uint16) +elif max_val <= np.iinfo(np.uint32).max: + labels = labels.astype(np.uint32) + +print('Creating output data structure', flush=True) +sd_output = sd.SpatialData( + labels={ + 'segmentation': Labels2DModel.parse( + xr.DataArray(labels, name='segmentation', dims=('y', 'x')), + transformations=transformation, + ) + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata.tables['table'].uns['dataset_id'], + 'method_id': meta['name'], + } + ) + }, +) + +print('Writing output', flush=True) +if os.path.exists(par['output']): + shutil.rmtree(par['output']) +sd_output.write(par['output']) diff --git a/src/methods/proseg/script_no_sopa.py b/src/methods/proseg/script_no_sopa.py new file mode 100644 index 0000000..b9008fb --- /dev/null +++ b/src/methods/proseg/script_no_sopa.py @@ -0,0 +1,145 @@ +import os +import shutil +from pathlib import Path +# from tifffile import imwrite +import dask +import numpy as np +import pandas as pd +import anndata as ad +import spatialdata as sd + + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr', + 'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr', + 'transcripts_key': 'transcripts', + 'coordinate_system': 'global', + 'output': './temp/proseg_transcripts.zarr', + +} +meta = { + 'name': 'proseg', + 'temp_dir': "/Users/habib/Projects/txsim_project/task_ist_preprocessing/temp" +} +## VIASH END + +TMP_DIR = Path(meta["temp_dir"] or "/tmp") +TMP_DIR.mkdir(parents=True, exist_ok=True) + +TRANSCRIPTS_CSV = TMP_DIR / "transcripts.csv" +SEGMENTATION_NPY = TMP_DIR / "segmentation.npy" +PROSEG_OUTPUT = TMP_DIR / "proseg_output.zarr" + + +# Read input +print('Reading input files', flush=True) +sdata = sd.read_zarr(par['input_ist']) +sdata_segm = sd.read_zarr(par['input_segmentation']) + +# Check if coordinate system is available in input data +transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys() +assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." +segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys() +assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." + +# Transform transcript coordinates to the coordinate system + +print('Transforming transcripts coordinates', flush=True) +transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system']) + +# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates +trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse() +transcripts = sd.transform(transcripts, trans, par['coordinate_system']) + +# Write transcripts to csv +print('Writing transcripts to csv', flush=True) +transcripts_df = transcripts[['x', 'y', 'z', 'feature_name']].compute() +transcripts_df['transcript_index'] = transcripts_df.index + +#add segmentation for cell_id +segmentation_image = sdata_segm["segmentation"]["scale0"].image.to_numpy() + +y_coords = transcripts.y.compute().to_numpy(dtype=np.int64) +x_coords = transcripts.x.compute().to_numpy(dtype=np.int64) +transcripts_df['cell_id'] = segmentation_image[y_coords, x_coords] +#write to csv +transcripts_df.to_csv(TRANSCRIPTS_CSV) + +# Write segmentation to tif +print('Writing segmentation to npy', flush=True) +np.save(SEGMENTATION_NPY, segmentation_image) + +# TODO: figure out how to extract the affine transformation from the spatialdata Transform +# Run Proseg +print('Running Proseg', flush=True) +#Make sure to add a space after each line +proseg_cmd = ( + f'''proseg {TRANSCRIPTS_CSV} --x-column x --y-column y --z-column z --gene-column feature_name ''' + f'''--cell-id-column cell_id --cell-id-unassigned 0 --transcript-id-column transcript_index ''' + f'''--cellpose-masks {SEGMENTATION_NPY} ''' + f'''--cellpose-x-transform 1 0 0 ''' + f'''--cellpose-y-transform 0 1 0 ''' + f'''--output-spatialdata {PROSEG_OUTPUT} --overwrite''' + +) +print("\t" + proseg_cmd, flush=True) +#TODO: UNCOMMENT THIS +os.system(proseg_cmd) + + +# Read Proseg output +print('Reading proseg output', flush=True) +proseg_sd = sd.read_zarr(PROSEG_OUTPUT) +proseg_transcripts = sd.transform(proseg_sd['transcripts'], to_coordinate_system=par['coordinate_system']).compute() +proseg_transcripts.set_index('transcript_id', inplace=True) #here transcript_id is the index of the original, not the transcript_id in the original +proseg_transcripts = proseg_transcripts.reindex(transcripts.index).fillna(-1) + +# Add cell ids to transcripts +#NOTE: ADD 1 SINCE PROSEG STARTS COUNTING CELLS AT 0 +print('Adding cell ids to transcripts', flush=True) +cell_id_dask_series = dask.dataframe.from_dask_array( + dask.array.from_array( + proseg_transcripts['assignment'].values+1, chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute()) + ), + index=sdata[par['transcripts_key']].index +) +sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series + + +# Create objects for cells table +print('Creating objects for cells table', flush=True) +#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id') +unique_cells = np.unique(cell_id_dask_series) + +# check if a '0' (noise/background) cell is in cell_id and remove +zero_idx = np.where(unique_cells == 0) +if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0]) + +#transform into pandas series and check +cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells) +assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix" + + +# Create transcripts only sdata +print('Subsetting to transcripts cell id data', flush=True) +sdata_transcripts_only = sd.SpatialData( + points={ + "transcripts": sdata[par['transcripts_key']] + }, + tables={ + "table": ad.AnnData( + obs=pd.DataFrame(cell_id_col), + var=sdata.tables["table"].var[[]] + ) + } +) + +# Write output +print('Write transcripts with cell ids', flush=True) +if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) + +sdata_transcripts_only.write(par['output']) diff --git a/src/methods/stardist/config.vsh.yaml b/src/methods/stardist/config.vsh.yaml new file mode 100644 index 0000000..f372f11 --- /dev/null +++ b/src/methods/stardist/config.vsh.yaml @@ -0,0 +1,54 @@ +__merge__: /src/api/comp_method.yaml + +name: stardist +label: "StarDist" + +# summary and description are generated by claude sonnet 4.6 based on the paper, github repo and documentation +summary: Detects star-convex shaped cells by predicting radial distances to the object boundary from each pixel. +description: | + StarDist represents each cell as a star-convex polygon defined by a set of radial distances from the cell's + center pixel to its boundary, measured at evenly spaced angles. A U-Net–style convolutional neural network + is trained to predict, for every pixel, both the probability that the pixel lies inside any cell and the + set of radial distances describing the cell polygon centered at that pixel. At inference time, non-maximum + suppression selects the most confident polygon proposals and suppresses overlapping detections, yielding a + final instance-segmentation label image. Because the star-convex parameterization compactly encodes shapes + typical of fluorescence-microscopy nuclei and cells, the method is highly data-efficient and generalizes + well across imaging modalities with pretrained models such as "2D_versatile_fluo". + +links: + documentation: "https://stardist.net" + repository: "https://github.com/stardist/stardist" +references: + doi: "10.48550/arXiv.1806.03535" + +arguments: + - name: --model + type: string + default: "2D_versatile_fluo" + description: "Pretrained StarDist model name." + info: + test_default: "2D_versatile_fluo" + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: + - /src/base/setup_spatialdata_partial.yaml + setup: + - type: python + pypi: stardist + - type: python + pypi: + - "numpy<2.0.0" + - tensorflow + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [ midtime, midcpu, highmem, gpu ] diff --git a/src/methods/stardist/script.py b/src/methods/stardist/script.py new file mode 100644 index 0000000..36b7e54 --- /dev/null +++ b/src/methods/stardist/script.py @@ -0,0 +1,120 @@ +import os +import shutil +import numpy as np +import xarray as xr +import spatialdata as sd +import anndata as ad +from csbdeep.data import Normalizer, normalize_mi_ma +from stardist.models import StarDist2D + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': r'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'output': r'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr', + 'model': r'2D_versatile_fluo' +} +meta = { + 'name': r'stardist', + 'functionality_name': r'stardist', + 'resources_dir': r'/private/tmp/viash_inject_stardist15754903354685377792', + 'executable': r'/private/tmp/viash_inject_stardist15754903354685377792/stardist', + 'config': r'/private/tmp/viash_inject_stardist15754903354685377792/.config.vsh.yaml', + 'temp_dir': r'/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/', + 'cpus': int(r'123'), + 'memory_b': int(r'123'), + 'memory_kb': int(r'123'), + 'memory_mb': int(r'123'), + 'memory_gb': int(r'123'), + 'memory_tb': int(r'123'), + 'memory_pb': int(r'123'), + 'memory_kib': int(r'123'), + 'memory_mib': int(r'123'), + 'memory_gib': int(r'123'), + 'memory_tib': int(r'123'), + 'memory_pib': int(r'123') +} +dep = { + +} + +## VIASH END + + +def convert_to_lower_dtype(arr): + max_val = arr.max() + if max_val <= np.iinfo(np.uint8).max: + new_dtype = np.uint8 + elif max_val <= np.iinfo(np.uint16).max: + new_dtype = np.uint16 + elif max_val <= np.iinfo(np.uint32).max: + new_dtype = np.uint32 + else: + new_dtype = np.uint64 + return arr.astype(new_dtype) + + +# Normalize per block without clipping using percentile-based min/max +class PercentileNormalizer(Normalizer): + def __init__(self, mi, ma): + self.mi, self.ma = mi, ma + + def before(self, x, axes): + return normalize_mi_ma(x, self.mi, self.ma, dtype=np.float32) + + def after(self, *args, **kwargs): + assert False + + @property + def do_after(self): + return False + + +print("Reading input", flush=True) +sdata = sd.read_zarr(par["input"]) +image = sdata["morphology_mip"]["scale0"].image.compute().to_numpy() +transformation = sdata["morphology_mip"]["scale0"].image.transform.copy() + +print(f"Loading pretrained StarDist model: {par['model']}", flush=True) +model = StarDist2D.from_pretrained(par["model"]) + +print("Running StarDist segmentation", flush=True) +mi, ma = np.percentile(image, [1, 99.8]) +normalizer = PercentileNormalizer(mi, ma) +block_size = min(image.shape[1] // 3, 4096) +offset = int(min(block_size / 5.5, 128)) + +labels, _ = model.predict_instances_big( + image[0, :, :], + axes="YX", + block_size=block_size, + min_overlap=offset, + context=offset, + normalizer=normalizer, +) + +print("Post-processing segmentation results", flush=True) +labels = convert_to_lower_dtype(labels) + +print("Creating output data structure", flush=True) +sd_output = sd.SpatialData( + labels={ + "segmentation": sd.models.Labels2DModel.parse( + xr.DataArray(labels, name="segmentation", dims=("y", "x")), + transformations=transformation, + ) + }, + tables={ + "table": ad.AnnData( + uns={ + "dataset_id": sdata.tables["table"].uns["dataset_id"], + "method_id": meta["name"], + } + ) + }, +) + +print("Writing output", flush=True) +if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) +sd_output.write(par["output"]) From 91c26ece687bb9b36f85ce0b733bc64458b0eb7f Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Sun, 17 May 2026 08:07:42 +0200 Subject: [PATCH 5/6] add to the benchmark run --- src/workflows/run_benchmark/config.vsh.yaml | 2 ++ src/workflows/run_benchmark/main.nf | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 67bf53c..013192e 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -65,6 +65,8 @@ dependencies: - name: control_methods/empty_labels - name: control_methods/random_voronoi - name: methods/cellpose + - name: methods/stardist + - name: methods/proseg - name: metrics/ari - name: data_processors/process_prediction diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index a61478f..0e0063d 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -10,7 +10,9 @@ methods = [ true_labels, empty_labels, random_voronoi, - cellpose + cellpose, + stardist, + proseg ] // construct list of metrics From 53b28c28a8baef2b86a5701c98f368e92380051c Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Mon, 18 May 2026 10:13:27 +0200 Subject: [PATCH 6/6] remove dev script --- src/methods/proseg/script_no_sopa.py | 145 --------------------------- 1 file changed, 145 deletions(-) delete mode 100644 src/methods/proseg/script_no_sopa.py diff --git a/src/methods/proseg/script_no_sopa.py b/src/methods/proseg/script_no_sopa.py deleted file mode 100644 index b9008fb..0000000 --- a/src/methods/proseg/script_no_sopa.py +++ /dev/null @@ -1,145 +0,0 @@ -import os -import shutil -from pathlib import Path -# from tifffile import imwrite -import dask -import numpy as np -import pandas as pd -import anndata as ad -import spatialdata as sd - - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr', - 'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr', - 'transcripts_key': 'transcripts', - 'coordinate_system': 'global', - 'output': './temp/proseg_transcripts.zarr', - -} -meta = { - 'name': 'proseg', - 'temp_dir': "/Users/habib/Projects/txsim_project/task_ist_preprocessing/temp" -} -## VIASH END - -TMP_DIR = Path(meta["temp_dir"] or "/tmp") -TMP_DIR.mkdir(parents=True, exist_ok=True) - -TRANSCRIPTS_CSV = TMP_DIR / "transcripts.csv" -SEGMENTATION_NPY = TMP_DIR / "segmentation.npy" -PROSEG_OUTPUT = TMP_DIR / "proseg_output.zarr" - - -# Read input -print('Reading input files', flush=True) -sdata = sd.read_zarr(par['input_ist']) -sdata_segm = sd.read_zarr(par['input_segmentation']) - -# Check if coordinate system is available in input data -transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys() -assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." -segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys() -assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." - -# Transform transcript coordinates to the coordinate system - -print('Transforming transcripts coordinates', flush=True) -transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system']) - -# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates -trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse() -transcripts = sd.transform(transcripts, trans, par['coordinate_system']) - -# Write transcripts to csv -print('Writing transcripts to csv', flush=True) -transcripts_df = transcripts[['x', 'y', 'z', 'feature_name']].compute() -transcripts_df['transcript_index'] = transcripts_df.index - -#add segmentation for cell_id -segmentation_image = sdata_segm["segmentation"]["scale0"].image.to_numpy() - -y_coords = transcripts.y.compute().to_numpy(dtype=np.int64) -x_coords = transcripts.x.compute().to_numpy(dtype=np.int64) -transcripts_df['cell_id'] = segmentation_image[y_coords, x_coords] -#write to csv -transcripts_df.to_csv(TRANSCRIPTS_CSV) - -# Write segmentation to tif -print('Writing segmentation to npy', flush=True) -np.save(SEGMENTATION_NPY, segmentation_image) - -# TODO: figure out how to extract the affine transformation from the spatialdata Transform -# Run Proseg -print('Running Proseg', flush=True) -#Make sure to add a space after each line -proseg_cmd = ( - f'''proseg {TRANSCRIPTS_CSV} --x-column x --y-column y --z-column z --gene-column feature_name ''' - f'''--cell-id-column cell_id --cell-id-unassigned 0 --transcript-id-column transcript_index ''' - f'''--cellpose-masks {SEGMENTATION_NPY} ''' - f'''--cellpose-x-transform 1 0 0 ''' - f'''--cellpose-y-transform 0 1 0 ''' - f'''--output-spatialdata {PROSEG_OUTPUT} --overwrite''' - -) -print("\t" + proseg_cmd, flush=True) -#TODO: UNCOMMENT THIS -os.system(proseg_cmd) - - -# Read Proseg output -print('Reading proseg output', flush=True) -proseg_sd = sd.read_zarr(PROSEG_OUTPUT) -proseg_transcripts = sd.transform(proseg_sd['transcripts'], to_coordinate_system=par['coordinate_system']).compute() -proseg_transcripts.set_index('transcript_id', inplace=True) #here transcript_id is the index of the original, not the transcript_id in the original -proseg_transcripts = proseg_transcripts.reindex(transcripts.index).fillna(-1) - -# Add cell ids to transcripts -#NOTE: ADD 1 SINCE PROSEG STARTS COUNTING CELLS AT 0 -print('Adding cell ids to transcripts', flush=True) -cell_id_dask_series = dask.dataframe.from_dask_array( - dask.array.from_array( - proseg_transcripts['assignment'].values+1, chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute()) - ), - index=sdata[par['transcripts_key']].index -) -sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series - - -# Create objects for cells table -print('Creating objects for cells table', flush=True) -#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id') -unique_cells = np.unique(cell_id_dask_series) - -# check if a '0' (noise/background) cell is in cell_id and remove -zero_idx = np.where(unique_cells == 0) -if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0]) - -#transform into pandas series and check -cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells) -assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix" - - -# Create transcripts only sdata -print('Subsetting to transcripts cell id data', flush=True) -sdata_transcripts_only = sd.SpatialData( - points={ - "transcripts": sdata[par['transcripts_key']] - }, - tables={ - "table": ad.AnnData( - obs=pd.DataFrame(cell_id_col), - var=sdata.tables["table"].var[[]] - ) - } -) - -# Write output -print('Write transcripts with cell ids', flush=True) -if os.path.exists(par["output"]): - shutil.rmtree(par["output"]) - -sdata_transcripts_only.write(par['output'])