From 75d6443d5f676a1d15d3685e223b80edcee238c9 Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Thu, 14 May 2026 13:33:25 +0200
Subject: [PATCH 1/6] Add CLAUDE.md with codebase guidance for Claude Code

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CLAUDE.md | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 CLAUDE.md
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..e71049f
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,79 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Framework: Viash + Nextflow
+
+This is a [Viash](https://viash.io)-based benchmarking task for spatial transcriptomics cell segmentation, part of the [OpenProblems](https://openproblems.bio) initiative. Every executable is a **Viash component** — a self-contained unit pairing a `config.vsh.yaml` (metadata, arguments, Docker setup) with a `script.py` or `script.R`. Nextflow workflows orchestrate multi-component pipelines.
+
+## Common Commands
+
+```bash
+# Sync test resources from S3 (required before testing)
+scripts/sync_resources.sh
+
+# Build all components (Docker-cached)
+viash ns build --parallel --setup cachedbuild
+
+# Test a single component
+viash test src/methods/cellpose/config.vsh.yaml
+
+# Test all components in parallel
+viash ns test --parallel
+
+# Run benchmark locally (test scale)
+scripts/run_benchmark/run_test_local.sh
+
+# Run full local benchmark
+scripts/run_benchmark/run_full_local.sh
+
+# Create a new method or metric from template
+common/scripts/create_component
+```
+
+## Architecture
+
+The task follows a fixed pipeline defined by the API specs in `src/api/`:
+
+```
+Raw spatial data
+    → [data_processor] process_dataset    → unlabelled + solution files
+    → [method / control_method]           → prediction file
+    → [data_processor] process_prediction → formatted prediction
+    → [metric] ari                        → score file
+```
+
+**Component types** (each defined in `src/api/comp_*.yaml`):
+- `control_method` — baseline segmentations (random Voronoi, true labels, empty labels)
+- `method` — segmentation algorithms under evaluation (currently: Cellpose)
+- `data_processor` — preprocessing and postprocessing steps
+- `metric` — evaluation metrics (currently: ARI)
+- `workflow` — Nextflow pipelines in `src/workflows/`
+
+**File format contracts** are defined in `src/api/file_*.yaml`. All component I/O must conform to these schemas. The `common/component_tests/run_and_check_output.py` helper enforces them during tests.
+
+## Component Structure
+
+Each component under `src/` follows this pattern:
+```
+src/<type>/<name>/
+├── config.vsh.yaml   # arguments, Docker image, test resources, test defaults
+└── script.py         # or script.R
+```
+
+Test resources and default parameters for `viash test` are declared inside `config.vsh.yaml` under `info.test_resources` and `info.test_default`.
+
+## Data Formats
+
+- **Spatial data**: Zarr-based `SpatialData` objects (`.zarr/`)
+- **Single-cell data**: AnnData HDF5 files (`.h5ad`)
+- Additional sources of ground truth will include spatial proteomics or annotated histology
+- Test datasets live in `resources_test/` (downloaded via `scripts/sync_resources.sh` from S3)
+
+## Important Notes
+
+- The `common/` directory is a **git submodule** — clone with `git clone --recursive` or run `git submodule update --init` after cloning.
+- `README.md` is **auto-generated** from the YAML API specs; do not edit it directly.
+- All components run inside Docker containers by default. Use `--platform docker` / `--engine docker` flags with Viash when needed.
+- `_viash.yaml` is the project-level Viash config (project name, organization, package registry, test resource S3 paths).
+- Don't commit to main, always create a new branch

From 1879d81d66ca2796233d510dca997f306009f54c Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Fri, 15 May 2026 00:07:30 +0200
Subject: [PATCH 2/6] add instructions on summary

---
 CLAUDE.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index e71049f..eefc529 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -15,6 +15,9 @@ scripts/sync_resources.sh
 # Build all components (Docker-cached)
 viash ns build --parallel --setup cachedbuild
 
+# Build all Docker containers (run before local benchmark)
+scripts/project/build_all_docker_containers.sh
+
 # Test a single component
 viash test src/methods/cellpose/config.vsh.yaml
 
@@ -29,6 +32,16 @@ scripts/run_benchmark/run_full_local.sh
 
 # Create a new method or metric from template
 common/scripts/create_component
+
+# Inspect the Docker image ID and Dockerfile for a built component
+target/executable/<name>/<name> ---docker_image_id
+target/executable/<name>/<name> ---dockerfile
+
+# Run a single built component directly
+target/executable/<name>/<name> --input <input> --output <output>
+
+# Run a built component as a Nextflow workflow
+nextflow run target/nextflow/<name> -profile docker --id <id> --input <input> --publish_dir out/
 ```
 
 ## Architecture
@@ -77,3 +90,4 @@ Test resources and default parameters for `viash test` are declared inside `conf
 - All components run inside Docker containers by default. Use `--platform docker` / `--engine docker` flags with Viash when needed.
 - `_viash.yaml` is the project-level Viash config (project name, organization, package registry, test resource S3 paths).
 - Don't commit to main, always create a new branch
+- Fill in the summary for a src/methods/<component>/config.vsh.yaml. Write a one sentence summary and a one paragraph summary of how this method works based on documentation and references.
\ No newline at end of file

From 45e6521b95e1d39206c35e4cd97ffd51737dd1cd Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Sat, 16 May 2026 22:00:07 +0200
Subject: [PATCH 3/6] add test instructions

---
 CLAUDE.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index eefc529..de122ca 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -90,4 +90,5 @@ Test resources and default parameters for `viash test` are declared inside `conf
 - All components run inside Docker containers by default. Use `--platform docker` / `--engine docker` flags with Viash when needed.
 - `_viash.yaml` is the project-level Viash config (project name, organization, package registry, test resource S3 paths).
 - Don't commit to main, always create a new branch
-- Fill in the summary for a src/methods/<component>/config.vsh.yaml. Write a one sentence summary and a one paragraph summary of how this method works based on documentation and references.
\ No newline at end of file
+- Fill in the summary for a src/methods/<component>/config.vsh.yaml. Write a one sentence summary and a one paragraph summary of how this method works based on documentation and references.
+- always use viash test to test new components
\ No newline at end of file

From 1e0843a7ecfdc4421962cd4b2b62f6bbfe2f9b2b Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Sun, 17 May 2026 08:07:08 +0200
Subject: [PATCH 4/6] add method implementation

---
 src/methods/proseg/config.vsh.yaml   |  91 +++++++++++++++
 src/methods/proseg/script.py         | 164 +++++++++++++++++++++++++++
 src/methods/proseg/script_no_sopa.py | 145 +++++++++++++++++++++++
 src/methods/stardist/config.vsh.yaml |  54 +++++++++
 src/methods/stardist/script.py       | 120 ++++++++++++++++++++
 5 files changed, 574 insertions(+)
 create mode 100644 src/methods/proseg/config.vsh.yaml
 create mode 100644 src/methods/proseg/script.py
 create mode 100644 src/methods/proseg/script_no_sopa.py
 create mode 100644 src/methods/stardist/config.vsh.yaml
 create mode 100644 src/methods/stardist/script.py

diff --git a/src/methods/proseg/config.vsh.yaml b/src/methods/proseg/config.vsh.yaml
new file mode 100644
index 0000000..420a97c
--- /dev/null
+++ b/src/methods/proseg/config.vsh.yaml
@@ -0,0 +1,91 @@
+__merge__: /src/api/comp_method.yaml
+
+name: proseg
+label: "Proseg"
+
+# summary and description are generated by claude sonnet 4.6 based on the paper, github repo and documentation
+summary: Infers cell boundaries from transcript spatial distributions using a Cellular Potts model–inspired probabilistic framework.
+description: |
+  Proseg assigns a probability to every configuration of cell labels on a voxel lattice, driven by a
+  spatial transcript count model that favors transcripts being contained within a single cell, a cell
+  compactness prior, and optional diffusion terms to account for transcripts that leak outside their
+  true cell of origin. The model is fit using a Markov chain Monte Carlo sampler that alternates between
+  sampling cell assignments for each voxel and sampling cell-specific expression parameters. A burn-in
+  phase at a coarser voxel resolution speeds up initialization before the final-resolution sampling
+  begins. Because the method is purely transcript-driven it does not require a morphology image, though
+  an optional nuclear-segmentation prior can be incorporated. Cell boundaries are reported as polygon
+  shapes and are rasterized to a label image for downstream analysis.
+
+links:
+  documentation: "https://github.com/dcjones/proseg"
+  repository: "https://github.com/dcjones/proseg"
+references:
+  doi: "10.1038/s41592-025-02697-0"
+
+arguments:
+  - name: --voxel_layers
+    type: integer
+    required: false
+    description: "Number of layers on the z-axis to model 3D cells."
+    default: 4
+    info:
+      test_default: 1
+
+  - name: --samples
+    type: integer
+    required: false
+    description: "Run the sampler for this many iterations."
+    default: 200
+    info:
+      test_default: 10
+
+  - name: --burnin_samples
+    type: integer
+    required: false
+    description: "Run the sampler for a preliminary N samples at a lower resolution."
+    default: 200
+    info:
+      test_default: 10
+
+  - name: --voxel_size
+    type: double
+    required: false
+    description: "Voxel size in microns on the x/y axis."
+    default: 1.0
+    info:
+      test_default: 2.0
+
+  - name: --burnin_voxel_size
+    type: double
+    required: false
+    description: "Larger voxel size for the burn-in phase. Must be an integer multiple of voxel_size."
+    default: 2.0
+    info:
+      test_default: 4.0
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml
+    setup:
+      - type: docker
+        env:
+          - PATH="/root/.cargo/bin:${PATH}"
+        run:
+          - curl https://sh.rustup.rs -sSf | sh -s -- -y
+          - echo 'source $HOME/.cargo/env' >> $HOME/.bashrc
+          - cargo install proseg
+      - type: python
+        pypi: [rasterio]
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ hightime, highcpu, midmem ]
diff --git a/src/methods/proseg/script.py b/src/methods/proseg/script.py
new file mode 100644
index 0000000..6c1d678
--- /dev/null
+++ b/src/methods/proseg/script.py
@@ -0,0 +1,164 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import xarray as xr
+import anndata as ad
+import spatialdata as sd
+from shapely.affinity import affine_transform as shapely_affine_transform
+from spatialdata.models import Labels2DModel
+from spatialdata.transformations import get_transformation
+from rasterio.features import rasterize as rio_rasterize
+
+## VIASH START
+# The following code has been auto-generated by Viash.
+par = {
+  'input': r'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr',
+  'output': r'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr',
+  'voxel_layers': int(r'4'),
+  'samples': int(r'200'),
+  'burnin_samples': int(r'200'),
+  'voxel_size': float(r'1.0'),
+  'burnin_voxel_size': float(r'2.0')
+}
+meta = {
+  'name': r'proseg',
+  'functionality_name': r'proseg',
+  'resources_dir': r'/private/tmp/viash_inject_proseg11113472565944867086',
+  'executable': r'/private/tmp/viash_inject_proseg11113472565944867086/proseg',
+  'config': r'/private/tmp/viash_inject_proseg11113472565944867086/.config.vsh.yaml',
+  'temp_dir': r'/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/',
+  'cpus': int(r'123'),
+  'memory_b': int(r'123'),
+  'memory_kb': int(r'123'),
+  'memory_mb': int(r'123'),
+  'memory_gb': int(r'123'),
+  'memory_tb': int(r'123'),
+  'memory_pb': int(r'123'),
+  'memory_kib': int(r'123'),
+  'memory_mib': int(r'123'),
+  'memory_gib': int(r'123'),
+  'memory_tib': int(r'123'),
+  'memory_pib': int(r'123')
+}
+dep = {
+
+}
+
+## VIASH END
+
+proseg_dir = Path(meta['temp_dir'] or '/tmp') / 'proseg'
+proseg_dir.mkdir(parents=True, exist_ok=True)
+
+print('Reading input', flush=True)
+sdata = sd.read_zarr(par['input'])
+image = sdata['morphology_mip']['scale0'].image.compute().to_numpy()
+transformation = sdata['morphology_mip']['scale0'].image.transform.copy()
+h, w = image.shape[-2:]
+
+print('Exporting transcripts to CSV', flush=True)
+transcripts_df = sdata['transcripts'].compute()
+has_z = 'z' in transcripts_df.columns and transcripts_df['z'].notna().any()
+
+# Proseg requires a prior cell-ID column. Build a simple grid-based prior so
+# proseg can refine the segmentation from a meaningful starting state.
+x_min, x_max = float(transcripts_df['x'].min()), float(transcripts_df['x'].max())
+y_min, y_max = float(transcripts_df['y'].min()), float(transcripts_df['y'].max())
+grid_size = 20.0  # microns per grid cell — roughly one cell diameter
+x_bins = np.arange(x_min, x_max + grid_size, grid_size)
+y_bins = np.arange(y_min, y_max + grid_size, grid_size)
+x_idx = np.digitize(transcripts_df['x'].to_numpy(), x_bins) - 1
+y_idx = np.digitize(transcripts_df['y'].to_numpy(), y_bins) - 1
+n_x = max(len(x_bins) - 1, 1)
+prior_cell_ids = (y_idx * n_x + x_idx) + 1  # 1-indexed; 0 is "unassigned"
+
+z_vals = transcripts_df['z'] if has_z else np.zeros(len(transcripts_df))
+csv_data = {
+    'x': transcripts_df['x'],
+    'y': transcripts_df['y'],
+    'z': z_vals,
+    'gene': transcripts_df['feature_name'],
+    'cell_id': prior_cell_ids,
+}
+transcript_csv = proseg_dir / 'transcripts.csv'
+pd.DataFrame(csv_data).to_csv(transcript_csv, index=False)
+
+n_threads = max((meta.get('cpus') or os.cpu_count() or 1) - 2, 1)
+cmd = [
+    'proseg', 'transcripts.csv',
+    '--x-column', 'x',
+    '--y-column', 'y',
+    '--z-column', 'z',
+    '--gene-column', 'gene',
+    '--cell-id-column', 'cell_id',
+    '--cell-id-unassigned', '0',
+    '--nthreads', str(n_threads),
+    '--ncomponents', '10',
+    '--diffusion-probability', '0.2',
+    '--diffusion-sigma-far', '4',
+    '--diffusion-sigma-near', '1',
+    '--nuclear-reassignment-prob', '0.2',
+    '--cell-compactness', '0.03',
+    '--voxel-layers', str(par['voxel_layers']),
+    '--samples', str(par['samples']),
+    '--burnin-samples', str(par['burnin_samples']),
+    '--recorded-samples', str(par['samples']),
+    '--voxel-size', str(par['voxel_size']),
+    '--burnin-voxel-size', str(par['burnin_voxel_size']),
+]
+print(f'Running Proseg: {" ".join(cmd)}', flush=True)
+subprocess.run(cmd, cwd=str(proseg_dir), check=True)
+
+print('Reading Proseg Zarr output', flush=True)
+proseg_sdata = sd.read_zarr(str(proseg_dir / 'proseg-output.zarr'))
+shapes = proseg_sdata.shapes['cell_boundaries']
+print(f'Found {len(shapes)} cell boundaries', flush=True)
+
+# Proseg boundaries are in global (micron) coordinates. Convert to pixel space
+# using the inverse of the morphology image's pixel-to-global transformation.
+print('Converting boundaries to pixel space and rasterizing', flush=True)
+img_transform = get_transformation(sdata['morphology_mip'], to_coordinate_system='global')
+affine_mat = img_transform.to_affine_matrix(input_axes=('x', 'y'), output_axes=('x', 'y'))
+inv_affine = np.linalg.inv(affine_mat)
+
+def global_to_pixel(geom):
+    a, b, xoff = inv_affine[0, 0], inv_affine[0, 1], inv_affine[0, 2]
+    d, e, yoff = inv_affine[1, 0], inv_affine[1, 1], inv_affine[1, 2]
+    return shapely_affine_transform(geom, [a, b, d, e, xoff, yoff])
+
+pixel_geoms = shapes['geometry'].apply(global_to_pixel)
+shape_geoms = list(zip(pixel_geoms, range(1, len(shapes) + 1)))
+labels = rio_rasterize(shape_geoms, out_shape=(h, w), fill=0, dtype=np.int32)
+
+max_val = labels.max()
+if max_val <= np.iinfo(np.uint8).max:
+    labels = labels.astype(np.uint8)
+elif max_val <= np.iinfo(np.uint16).max:
+    labels = labels.astype(np.uint16)
+elif max_val <= np.iinfo(np.uint32).max:
+    labels = labels.astype(np.uint32)
+
+print('Creating output data structure', flush=True)
+sd_output = sd.SpatialData(
+    labels={
+        'segmentation': Labels2DModel.parse(
+            xr.DataArray(labels, name='segmentation', dims=('y', 'x')),
+            transformations=transformation,
+        )
+    },
+    tables={
+        'table': ad.AnnData(
+            uns={
+                'dataset_id': sdata.tables['table'].uns['dataset_id'],
+                'method_id': meta['name'],
+            }
+        )
+    },
+)
+
+print('Writing output', flush=True)
+if os.path.exists(par['output']):
+    shutil.rmtree(par['output'])
+sd_output.write(par['output'])
diff --git a/src/methods/proseg/script_no_sopa.py b/src/methods/proseg/script_no_sopa.py
new file mode 100644
index 0000000..b9008fb
--- /dev/null
+++ b/src/methods/proseg/script_no_sopa.py
@@ -0,0 +1,145 @@
+import os
+import shutil
+from pathlib import Path
+# from tifffile import imwrite
+import dask
+import numpy as np
+import pandas as pd
+import anndata as ad
+import spatialdata as sd
+
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {
+  'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
+  'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
+  'transcripts_key': 'transcripts',
+  'coordinate_system': 'global',
+  'output': './temp/proseg_transcripts.zarr',
+
+}
+meta = {
+  'name': 'proseg',
+  'temp_dir': "/Users/habib/Projects/txsim_project/task_ist_preprocessing/temp"
+}
+## VIASH END
+
+TMP_DIR = Path(meta["temp_dir"] or "/tmp")
+TMP_DIR.mkdir(parents=True, exist_ok=True)
+
+TRANSCRIPTS_CSV = TMP_DIR / "transcripts.csv"
+SEGMENTATION_NPY = TMP_DIR / "segmentation.npy"
+PROSEG_OUTPUT = TMP_DIR / "proseg_output.zarr"
+
+
+# Read input
+print('Reading input files', flush=True)
+sdata = sd.read_zarr(par['input_ist'])
+sdata_segm = sd.read_zarr(par['input_segmentation'])
+
+# Check if coordinate system is available in input data
+transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
+assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
+segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
+assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
+
+# Transform transcript coordinates to the coordinate system
+
+print('Transforming transcripts coordinates', flush=True)
+transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])
+
+# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
+trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
+transcripts = sd.transform(transcripts, trans, par['coordinate_system'])
+
+# Write transcripts to csv 
+print('Writing transcripts to csv', flush=True)
+transcripts_df = transcripts[['x', 'y', 'z', 'feature_name']].compute()
+transcripts_df['transcript_index'] = transcripts_df.index
+
+#add segmentation for cell_id
+segmentation_image = sdata_segm["segmentation"]["scale0"].image.to_numpy()
+
+y_coords = transcripts.y.compute().to_numpy(dtype=np.int64)
+x_coords = transcripts.x.compute().to_numpy(dtype=np.int64)
+transcripts_df['cell_id'] = segmentation_image[y_coords, x_coords]
+#write to csv
+transcripts_df.to_csv(TRANSCRIPTS_CSV)
+
+# Write segmentation to tif
+print('Writing segmentation to npy', flush=True)
+np.save(SEGMENTATION_NPY, segmentation_image)
+
+# TODO: figure out how to extract the affine transformation from the spatialdata Transform
+# Run Proseg
+print('Running Proseg', flush=True)
+#Make sure to add a space after each line
+proseg_cmd = (
+    f'''proseg {TRANSCRIPTS_CSV} --x-column x --y-column y --z-column z --gene-column feature_name '''
+    f'''--cell-id-column cell_id --cell-id-unassigned 0 --transcript-id-column transcript_index '''
+    f'''--cellpose-masks {SEGMENTATION_NPY} '''
+    f'''--cellpose-x-transform 1 0 0 '''
+    f'''--cellpose-y-transform 0 1 0 '''
+    f'''--output-spatialdata {PROSEG_OUTPUT} --overwrite'''
+
+)
+print("\t" + proseg_cmd, flush=True)
+#TODO: UNCOMMENT THIS
+os.system(proseg_cmd)
+
+
+# Read Proseg output
+print('Reading proseg output', flush=True)
+proseg_sd = sd.read_zarr(PROSEG_OUTPUT)
+proseg_transcripts = sd.transform(proseg_sd['transcripts'], to_coordinate_system=par['coordinate_system']).compute()
+proseg_transcripts.set_index('transcript_id', inplace=True) #here transcript_id is the index of the original, not the transcript_id in the original
+proseg_transcripts = proseg_transcripts.reindex(transcripts.index).fillna(-1)
+
+# Add cell ids to transcripts
+#NOTE: ADD 1 SINCE PROSEG STARTS COUNTING CELLS AT 0 
+print('Adding cell ids to transcripts', flush=True)
+cell_id_dask_series = dask.dataframe.from_dask_array(
+    dask.array.from_array(
+        proseg_transcripts['assignment'].values+1, chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
+    ), 
+    index=sdata[par['transcripts_key']].index
+)
+sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series
+
+
+# Create objects for cells table
+print('Creating objects for cells table', flush=True)
+#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
+unique_cells = np.unique(cell_id_dask_series)
+
+# check if a '0' (noise/background) cell is in cell_id and remove
+zero_idx = np.where(unique_cells == 0)
+if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0])
+
+#transform into pandas series and check
+cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
+assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix"
+
+
+# Create transcripts only sdata
+print('Subsetting to transcripts cell id data', flush=True)
+sdata_transcripts_only = sd.SpatialData(
+    points={
+        "transcripts": sdata[par['transcripts_key']]
+    },
+    tables={
+        "table": ad.AnnData(
+          obs=pd.DataFrame(cell_id_col),
+          var=sdata.tables["table"].var[[]]
+        )
+    }
+)
+
+# Write output
+print('Write transcripts with cell ids', flush=True)
+if os.path.exists(par["output"]):
+    shutil.rmtree(par["output"])
+    
+sdata_transcripts_only.write(par['output'])
diff --git a/src/methods/stardist/config.vsh.yaml b/src/methods/stardist/config.vsh.yaml
new file mode 100644
index 0000000..f372f11
--- /dev/null
+++ b/src/methods/stardist/config.vsh.yaml
@@ -0,0 +1,54 @@
+__merge__: /src/api/comp_method.yaml
+
+name: stardist
+label: "StarDist"
+
+# summary and description are generated by claude sonnet 4.6 based on the paper, github repo and documentation
+summary: Detects star-convex shaped cells by predicting radial distances to the object boundary from each pixel.
+description: |
+  StarDist represents each cell as a star-convex polygon defined by a set of radial distances from the cell's
+  center pixel to its boundary, measured at evenly spaced angles. A U-Net–style convolutional neural network
+  is trained to predict, for every pixel, both the probability that the pixel lies inside any cell and the
+  set of radial distances describing the cell polygon centered at that pixel. At inference time, non-maximum
+  suppression selects the most confident polygon proposals and suppresses overlapping detections, yielding a
+  final instance-segmentation label image. Because the star-convex parameterization compactly encodes shapes
+  typical of fluorescence-microscopy nuclei and cells, the method is highly data-efficient and generalizes
+  well across imaging modalities with pretrained models such as "2D_versatile_fluo".
+
+links:
+  documentation: "https://stardist.net"
+  repository: "https://github.com/stardist/stardist"
+references:
+  doi: "10.48550/arXiv.1806.03535"
+
+arguments:
+  - name: --model
+    type: string
+    default: "2D_versatile_fluo"
+    description: "Pretrained StarDist model name."
+    info:
+      test_default: "2D_versatile_fluo"
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml
+    setup:
+      - type: python
+        pypi: stardist
+      - type: python
+        pypi:
+          - "numpy<2.0.0"
+          - tensorflow
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ midtime, midcpu, highmem, gpu ]
diff --git a/src/methods/stardist/script.py b/src/methods/stardist/script.py
new file mode 100644
index 0000000..36b7e54
--- /dev/null
+++ b/src/methods/stardist/script.py
@@ -0,0 +1,120 @@
+import os
+import shutil
+import numpy as np
+import xarray as xr
+import spatialdata as sd
+import anndata as ad
+from csbdeep.data import Normalizer, normalize_mi_ma
+from stardist.models import StarDist2D
+
+## VIASH START
+# The following code has been auto-generated by Viash.
+par = {
+  'input': r'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr',
+  'output': r'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr',
+  'model': r'2D_versatile_fluo'
+}
+meta = {
+  'name': r'stardist',
+  'functionality_name': r'stardist',
+  'resources_dir': r'/private/tmp/viash_inject_stardist15754903354685377792',
+  'executable': r'/private/tmp/viash_inject_stardist15754903354685377792/stardist',
+  'config': r'/private/tmp/viash_inject_stardist15754903354685377792/.config.vsh.yaml',
+  'temp_dir': r'/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/',
+  'cpus': int(r'123'),
+  'memory_b': int(r'123'),
+  'memory_kb': int(r'123'),
+  'memory_mb': int(r'123'),
+  'memory_gb': int(r'123'),
+  'memory_tb': int(r'123'),
+  'memory_pb': int(r'123'),
+  'memory_kib': int(r'123'),
+  'memory_mib': int(r'123'),
+  'memory_gib': int(r'123'),
+  'memory_tib': int(r'123'),
+  'memory_pib': int(r'123')
+}
+dep = {
+  
+}
+
+## VIASH END
+
+
+def convert_to_lower_dtype(arr):
+    max_val = arr.max()
+    if max_val <= np.iinfo(np.uint8).max:
+        new_dtype = np.uint8
+    elif max_val <= np.iinfo(np.uint16).max:
+        new_dtype = np.uint16
+    elif max_val <= np.iinfo(np.uint32).max:
+        new_dtype = np.uint32
+    else:
+        new_dtype = np.uint64
+    return arr.astype(new_dtype)
+
+
+# Normalize per block without clipping using percentile-based min/max
+class PercentileNormalizer(Normalizer):
+    def __init__(self, mi, ma):
+        self.mi, self.ma = mi, ma
+
+    def before(self, x, axes):
+        return normalize_mi_ma(x, self.mi, self.ma, dtype=np.float32)
+
+    def after(self, *args, **kwargs):
+        assert False
+
+    @property
+    def do_after(self):
+        return False
+
+
+print("Reading input", flush=True)
+sdata = sd.read_zarr(par["input"])
+image = sdata["morphology_mip"]["scale0"].image.compute().to_numpy()
+transformation = sdata["morphology_mip"]["scale0"].image.transform.copy()
+
+print(f"Loading pretrained StarDist model: {par['model']}", flush=True)
+model = StarDist2D.from_pretrained(par["model"])
+
+print("Running StarDist segmentation", flush=True)
+mi, ma = np.percentile(image, [1, 99.8])
+normalizer = PercentileNormalizer(mi, ma)
+block_size = min(image.shape[1] // 3, 4096)
+offset = int(min(block_size / 5.5, 128))
+
+labels, _ = model.predict_instances_big(
+    image[0, :, :],
+    axes="YX",
+    block_size=block_size,
+    min_overlap=offset,
+    context=offset,
+    normalizer=normalizer,
+)
+
+print("Post-processing segmentation results", flush=True)
+labels = convert_to_lower_dtype(labels)
+
+print("Creating output data structure", flush=True)
+sd_output = sd.SpatialData(
+    labels={
+        "segmentation": sd.models.Labels2DModel.parse(
+            xr.DataArray(labels, name="segmentation", dims=("y", "x")),
+            transformations=transformation,
+        )
+    },
+    tables={
+        "table": ad.AnnData(
+            uns={
+                "dataset_id": sdata.tables["table"].uns["dataset_id"],
+                "method_id": meta["name"],
+            }
+        )
+    },
+)
+
+print("Writing output", flush=True)
+if os.path.exists(par["output"]):
+    shutil.rmtree(par["output"])
+sd_output.write(par["output"])

From 91c26ece687bb9b36f85ce0b733bc64458b0eb7f Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Sun, 17 May 2026 08:07:42 +0200
Subject: [PATCH 5/6] add to the benchmark run

---
 src/workflows/run_benchmark/config.vsh.yaml | 2 ++
 src/workflows/run_benchmark/main.nf         | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 67bf53c..013192e 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -65,6 +65,8 @@ dependencies:
   - name: control_methods/empty_labels
   - name: control_methods/random_voronoi
   - name: methods/cellpose
+  - name: methods/stardist
+  - name: methods/proseg
   - name: metrics/ari
   - name: data_processors/process_prediction
 
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index a61478f..0e0063d 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -10,7 +10,9 @@ methods = [
   true_labels,
   empty_labels,
   random_voronoi,
-  cellpose
+  cellpose,
+  stardist,
+  proseg
 ]
 
 // construct list of metrics

From 53b28c28a8baef2b86a5701c98f368e92380051c Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Mon, 18 May 2026 10:13:27 +0200
Subject: [PATCH 6/6] remove dev script

---
 src/methods/proseg/script_no_sopa.py | 145 ---------------------------
 1 file changed, 145 deletions(-)
 delete mode 100644 src/methods/proseg/script_no_sopa.py

diff --git a/src/methods/proseg/script_no_sopa.py b/src/methods/proseg/script_no_sopa.py
deleted file mode 100644
index b9008fb..0000000
--- a/src/methods/proseg/script_no_sopa.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import os
-import shutil
-from pathlib import Path
-# from tifffile import imwrite
-import dask
-import numpy as np
-import pandas as pd
-import anndata as ad
-import spatialdata as sd
-
-
-## VIASH START
-# Note: this section is auto-generated by viash at runtime. To edit it, make changes
-# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
-par = {
-  'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
-  'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
-  'transcripts_key': 'transcripts',
-  'coordinate_system': 'global',
-  'output': './temp/proseg_transcripts.zarr',
-
-}
-meta = {
-  'name': 'proseg',
-  'temp_dir': "/Users/habib/Projects/txsim_project/task_ist_preprocessing/temp"
-}
-## VIASH END
-
-TMP_DIR = Path(meta["temp_dir"] or "/tmp")
-TMP_DIR.mkdir(parents=True, exist_ok=True)
-
-TRANSCRIPTS_CSV = TMP_DIR / "transcripts.csv"
-SEGMENTATION_NPY = TMP_DIR / "segmentation.npy"
-PROSEG_OUTPUT = TMP_DIR / "proseg_output.zarr"
-
-
-# Read input
-print('Reading input files', flush=True)
-sdata = sd.read_zarr(par['input_ist'])
-sdata_segm = sd.read_zarr(par['input_segmentation'])
-
-# Check if coordinate system is available in input data
-transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
-assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
-segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
-assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
-
-# Transform transcript coordinates to the coordinate system
-
-print('Transforming transcripts coordinates', flush=True)
-transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])
-
-# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
-trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
-transcripts = sd.transform(transcripts, trans, par['coordinate_system'])
-
-# Write transcripts to csv 
-print('Writing transcripts to csv', flush=True)
-transcripts_df = transcripts[['x', 'y', 'z', 'feature_name']].compute()
-transcripts_df['transcript_index'] = transcripts_df.index
-
-#add segmentation for cell_id
-segmentation_image = sdata_segm["segmentation"]["scale0"].image.to_numpy()
-
-y_coords = transcripts.y.compute().to_numpy(dtype=np.int64)
-x_coords = transcripts.x.compute().to_numpy(dtype=np.int64)
-transcripts_df['cell_id'] = segmentation_image[y_coords, x_coords]
-#write to csv
-transcripts_df.to_csv(TRANSCRIPTS_CSV)
-
-# Write segmentation to tif
-print('Writing segmentation to npy', flush=True)
-np.save(SEGMENTATION_NPY, segmentation_image)
-
-# TODO: figure out how to extract the affine transformation from the spatialdata Transform
-# Run Proseg
-print('Running Proseg', flush=True)
-#Make sure to add a space after each line
-proseg_cmd = (
-    f'''proseg {TRANSCRIPTS_CSV} --x-column x --y-column y --z-column z --gene-column feature_name '''
-    f'''--cell-id-column cell_id --cell-id-unassigned 0 --transcript-id-column transcript_index '''
-    f'''--cellpose-masks {SEGMENTATION_NPY} '''
-    f'''--cellpose-x-transform 1 0 0 '''
-    f'''--cellpose-y-transform 0 1 0 '''
-    f'''--output-spatialdata {PROSEG_OUTPUT} --overwrite'''
-
-)
-print("\t" + proseg_cmd, flush=True)
-#TODO: UNCOMMENT THIS
-os.system(proseg_cmd)
-
-
-# Read Proseg output
-print('Reading proseg output', flush=True)
-proseg_sd = sd.read_zarr(PROSEG_OUTPUT)
-proseg_transcripts = sd.transform(proseg_sd['transcripts'], to_coordinate_system=par['coordinate_system']).compute()
-proseg_transcripts.set_index('transcript_id', inplace=True) #here transcript_id is the index of the original, not the transcript_id in the original
-proseg_transcripts = proseg_transcripts.reindex(transcripts.index).fillna(-1)
-
-# Add cell ids to transcripts
-#NOTE: ADD 1 SINCE PROSEG STARTS COUNTING CELLS AT 0 
-print('Adding cell ids to transcripts', flush=True)
-cell_id_dask_series = dask.dataframe.from_dask_array(
-    dask.array.from_array(
-        proseg_transcripts['assignment'].values+1, chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
-    ), 
-    index=sdata[par['transcripts_key']].index
-)
-sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series
-
-
-# Create objects for cells table
-print('Creating objects for cells table', flush=True)
-#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
-unique_cells = np.unique(cell_id_dask_series)
-
-# check if a '0' (noise/background) cell is in cell_id and remove
-zero_idx = np.where(unique_cells == 0)
-if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0])
-
-#transform into pandas series and check
-cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
-assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix"
-
-
-# Create transcripts only sdata
-print('Subsetting to transcripts cell id data', flush=True)
-sdata_transcripts_only = sd.SpatialData(
-    points={
-        "transcripts": sdata[par['transcripts_key']]
-    },
-    tables={
-        "table": ad.AnnData(
-          obs=pd.DataFrame(cell_id_col),
-          var=sdata.tables["table"].var[[]]
-        )
-    }
-)
-
-# Write output
-print('Write transcripts with cell ids', flush=True)
-if os.path.exists(par["output"]):
-    shutil.rmtree(par["output"])
-    
-sdata_transcripts_only.write(par['output'])