openproblems-bio · heylf · May 18, 2026 · May 14, 2026 · May 14, 2026 · May 16, 2026
diff --git a/src/methods/proseg/config.vsh.yaml b/src/methods/proseg/config.vsh.yaml
@@ -0,0 +1,91 @@
+__merge__: /src/api/comp_method.yaml
+
+name: proseg
+label: "Proseg"
+
+# Summary and description were generated by Claude Sonnet 4.6 based on the paper, GitHub repository, and documentation.
+summary: Infers cell boundaries from transcript spatial distributions using a Cellular Potts model–inspired probabilistic framework.
+description: |
+  Proseg assigns a probability to every configuration of cell labels on a voxel lattice, driven by a
+  spatial transcript count model that favors transcripts being contained within a single cell, a cell
+  compactness prior, and optional diffusion terms to account for transcripts that leak outside their
+  true cell of origin. The model is fit using a Markov chain Monte Carlo sampler that alternates between
+  sampling cell assignments for each voxel and sampling cell-specific expression parameters. A burn-in
+  phase at a coarser voxel resolution speeds up initialization before the final-resolution sampling
+  begins. Because the method is purely transcript-driven it does not require a morphology image, though
+  an optional nuclear-segmentation prior can be incorporated. Cell boundaries are reported as polygon
+  shapes and are rasterized to a label image for downstream analysis.
+
+links:
+  documentation: "https://github.com/dcjones/proseg"
+  repository: "https://github.com/dcjones/proseg"
+references:
+  doi: "10.1038/s41592-025-02697-0"
+
+arguments:
+  - name: --voxel_layers
+    type: integer
+    required: false
+    description: "Number of layers on the z-axis to model 3D cells."
+    default: 4
+    info:
+      test_default: 1
+
+  - name: --samples
+    type: integer
+    required: false
+    description: "Run the sampler for this many iterations."
+    default: 200
+    info:
+      test_default: 10
+
+  - name: --burnin_samples
+    type: integer
+    required: false
+    description: "Number of preliminary (burn-in) sampler iterations to run at a coarser voxel resolution."
+    default: 200
+    info:
+      test_default: 10
+
+  - name: --voxel_size
+    type: double
+    required: false
+    description: "Voxel size in microns on the x/y axis."
+    default: 1.0
+    info:
+      test_default: 2.0
+
+  - name: --burnin_voxel_size
+    type: double
+    required: false
+    description: "Larger voxel size for the burn-in phase. Must be an integer multiple of voxel_size."
+    default: 2.0
+    info:
+      test_default: 4.0
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml
+    setup:
+      - type: docker
+        env:
+          - PATH="/root/.cargo/bin:${PATH}"
+        run:
+          - curl https://sh.rustup.rs -sSf | sh -s -- -y
+          - echo 'source $HOME/.cargo/env' >> $HOME/.bashrc
+          - cargo install proseg
+      - type: python
+        pypi: [rasterio]
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ hightime, highcpu, midmem ]
diff --git a/src/methods/proseg/script.py b/src/methods/proseg/script.py
@@ -0,0 +1,164 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import xarray as xr
+import anndata as ad
+import spatialdata as sd
+from shapely.affinity import affine_transform as shapely_affine_transform
+from spatialdata.models import Labels2DModel
+from spatialdata.transformations import get_transformation
+from rasterio.features import rasterize as rio_rasterize
+
+## VIASH START
+# The following code has been auto-generated by Viash.
+par = {  # component parameters; the tuning keys match the arguments declared in config.vsh.yaml
+  'input': r'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr',
+  'output': r'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr',
+  'voxel_layers': int(r'4'),
+  'samples': int(r'200'),
+  'burnin_samples': int(r'200'),
+  'voxel_size': float(r'1.0'),
+  'burnin_voxel_size': float(r'2.0')
+}
+meta = {  # execution metadata; this script reads 'temp_dir', 'cpus' and 'name'
+  'name': r'proseg',
+  'functionality_name': r'proseg',
+  'resources_dir': r'/private/tmp/viash_inject_proseg11113472565944867086',
+  'executable': r'/private/tmp/viash_inject_proseg11113472565944867086/proseg',
+  'config': r'/private/tmp/viash_inject_proseg11113472565944867086/.config.vsh.yaml',
+  'temp_dir': r'/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/',
+  'cpus': int(r'123'),
+  'memory_b': int(r'123'),
+  'memory_kb': int(r'123'),
+  'memory_mb': int(r'123'),
+  'memory_gb': int(r'123'),
+  'memory_tb': int(r'123'),
+  'memory_pb': int(r'123'),
+  'memory_kib': int(r'123'),
+  'memory_mib': int(r'123'),
+  'memory_gib': int(r'123'),
+  'memory_tib': int(r'123'),
+  'memory_pib': int(r'123')
+}
+dep = {  # empty and not referenced anywhere in this script
+
+}
+
+## VIASH END
+
+proseg_dir = Path(meta['temp_dir'] or '/tmp') / 'proseg'
+proseg_dir.mkdir(parents=True, exist_ok=True)
+
+print('Reading input', flush=True)
+sdata = sd.read_zarr(par['input'])
+image = sdata['morphology_mip']['scale0'].image.compute().to_numpy()
+transformation = sdata['morphology_mip']['scale0'].image.transform.copy()
+h, w = image.shape[-2:]
+
+print('Exporting transcripts to CSV', flush=True)
+transcripts_df = sdata['transcripts'].compute()
+has_z = 'z' in transcripts_df.columns and transcripts_df['z'].notna().any()
+
+# Proseg requires a prior cell-ID column. Build a simple grid-based prior so
+# proseg can refine the segmentation from a meaningful starting state.
+x_min, x_max = float(transcripts_df['x'].min()), float(transcripts_df['x'].max())
+y_min, y_max = float(transcripts_df['y'].min()), float(transcripts_df['y'].max())
+grid_size = 20.0  # microns per grid cell — roughly one cell diameter
+x_bins = np.arange(x_min, x_max + grid_size, grid_size)
+y_bins = np.arange(y_min, y_max + grid_size, grid_size)
+x_idx = np.digitize(transcripts_df['x'].to_numpy(), x_bins) - 1
+y_idx = np.digitize(transcripts_df['y'].to_numpy(), y_bins) - 1
+n_x = max(len(x_bins) - 1, 1)
+prior_cell_ids = (y_idx * n_x + x_idx) + 1  # 1-indexed; 0 is "unassigned"
+
+z_vals = transcripts_df['z'] if has_z else np.zeros(len(transcripts_df))
+csv_data = {
+    'x': transcripts_df['x'],
+    'y': transcripts_df['y'],
+    'z': z_vals,
+    'gene': transcripts_df['feature_name'],
+    'cell_id': prior_cell_ids,
+}
+transcript_csv = proseg_dir / 'transcripts.csv'
+pd.DataFrame(csv_data).to_csv(transcript_csv, index=False)
+
+n_threads = max((meta.get('cpus') or os.cpu_count() or 1) - 2, 1)
+cmd = [
+    'proseg', 'transcripts.csv',
+    '--x-column', 'x',
+    '--y-column', 'y',
+    '--z-column', 'z',
+    '--gene-column', 'gene',
+    '--cell-id-column', 'cell_id',
+    '--cell-id-unassigned', '0',
+    '--nthreads', str(n_threads),
+    '--ncomponents', '10',
+    '--diffusion-probability', '0.2',
+    '--diffusion-sigma-far', '4',
+    '--diffusion-sigma-near', '1',
+    '--nuclear-reassignment-prob', '0.2',
+    '--cell-compactness', '0.03',
+    '--voxel-layers', str(par['voxel_layers']),
+    '--samples', str(par['samples']),
+    '--burnin-samples', str(par['burnin_samples']),
+    '--recorded-samples', str(par['samples']),
+    '--voxel-size', str(par['voxel_size']),
+    '--burnin-voxel-size', str(par['burnin_voxel_size']),
+]
+print(f'Running Proseg: {" ".join(cmd)}', flush=True)
+subprocess.run(cmd, cwd=str(proseg_dir), check=True)
+
+print('Reading Proseg Zarr output', flush=True)
+proseg_sdata = sd.read_zarr(str(proseg_dir / 'proseg-output.zarr'))
+shapes = proseg_sdata.shapes['cell_boundaries']
+print(f'Found {len(shapes)} cell boundaries', flush=True)
+
+# Proseg boundaries are in global (micron) coordinates. Convert to pixel space
+# using the inverse of the morphology image's pixel-to-global transformation.
+print('Converting boundaries to pixel space and rasterizing', flush=True)
+img_transform = get_transformation(sdata['morphology_mip'], to_coordinate_system='global')
+affine_mat = img_transform.to_affine_matrix(input_axes=('x', 'y'), output_axes=('x', 'y'))
+inv_affine = np.linalg.inv(affine_mat)
+
+def global_to_pixel(geom):
+    a, b, xoff = inv_affine[0, 0], inv_affine[0, 1], inv_affine[0, 2]
+    d, e, yoff = inv_affine[1, 0], inv_affine[1, 1], inv_affine[1, 2]
+    return shapely_affine_transform(geom, [a, b, d, e, xoff, yoff])
+
+pixel_geoms = shapes['geometry'].apply(global_to_pixel)
+shape_geoms = list(zip(pixel_geoms, range(1, len(shapes) + 1)))
+labels = rio_rasterize(shape_geoms, out_shape=(h, w), fill=0, dtype=np.int32)
+
+max_val = labels.max()
+if max_val <= np.iinfo(np.uint8).max:
+    labels = labels.astype(np.uint8)
+elif max_val <= np.iinfo(np.uint16).max:
+    labels = labels.astype(np.uint16)
+elif max_val <= np.iinfo(np.uint32).max:
+    labels = labels.astype(np.uint32)
+
+print('Creating output data structure', flush=True)
+sd_output = sd.SpatialData(
+    labels={
+        'segmentation': Labels2DModel.parse(
+            xr.DataArray(labels, name='segmentation', dims=('y', 'x')),
+            transformations=transformation,
+        )
+    },
+    tables={
+        'table': ad.AnnData(
+            uns={
+                'dataset_id': sdata.tables['table'].uns['dataset_id'],
+                'method_id': meta['name'],
+            }
+        )
+    },
+)
+
+print('Writing output', flush=True)
+if os.path.exists(par['output']):
+    shutil.rmtree(par['output'])
+sd_output.write(par['output'])
diff --git a/src/methods/stardist/config.vsh.yaml b/src/methods/stardist/config.vsh.yaml
@@ -0,0 +1,54 @@
+__merge__: /src/api/comp_method.yaml
+
+name: stardist
+label: "StarDist"
+
+# Summary and description were generated by Claude Sonnet 4.6 based on the paper, GitHub repository, and documentation.
+summary: Detects star-convex shaped cells by predicting radial distances to the object boundary from each pixel.
+description: |
+  StarDist represents each cell as a star-convex polygon defined by a set of radial distances from the cell's
+  center pixel to its boundary, measured at evenly spaced angles. A U-Net–style convolutional neural network
+  is trained to predict, for every pixel, both the probability that the pixel lies inside any cell and the
+  set of radial distances describing the cell polygon centered at that pixel. At inference time, non-maximum
+  suppression selects the most confident polygon proposals and suppresses overlapping detections, yielding a
+  final instance-segmentation label image. Because the star-convex parameterization compactly encodes shapes
+  typical of fluorescence-microscopy nuclei and cells, the method is highly data-efficient and generalizes
+  well across imaging modalities with pretrained models such as "2D_versatile_fluo".
+
+links:
+  documentation: "https://stardist.net"
+  repository: "https://github.com/stardist/stardist"
+references:
+  doi: "10.48550/arXiv.1806.03535"
+
+arguments:
+  - name: --model
+    type: string
+    default: "2D_versatile_fluo"
+    description: "Pretrained StarDist model name."
+    info:
+      test_default: "2D_versatile_fluo"
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml
+    setup:
+      - type: python
+        pypi: stardist
+      - type: python
+        pypi:
+          - "numpy<2.0.0"
+          - tensorflow
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ midtime, midcpu, highmem, gpu ]