From ad2af76877e535e33d331d42065d3e681919ab5d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 13 May 2026 13:17:31 +0200 Subject: [PATCH 1/2] update scripts --- CHANGELOG.md | 19 +-- scripts/create_resources/resources.sh | 8 +- scripts/run_benchmark/run_full_local.sh | 4 +- scripts/run_benchmark/run_full_seqeracloud.sh | 10 +- scripts/run_benchmark/run_test_seqeracloud.sh | 30 ++-- src/base/labels_tw.config | 158 ++++++++++++++++++ 6 files changed, 185 insertions(+), 44 deletions(-) create mode 100644 src/base/labels_tw.config diff --git a/CHANGELOG.md b/CHANGELOG.md index 5962f06..7158473 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# task_template x.y.z +# task_spatial_segmentation x.y.z ## BREAKING CHANGES @@ -6,25 +6,16 @@ ## NEW FUNCTIONALITY -* Added `control_methods/true_labels` component (PR #5). - -* Added `methods/logistic_regression` component (PR #5). - -* Added `metrics/accuracy` component (PR #5). +* ... ## MAJOR CHANGES -* Updated `api` files (PR #5). - -* Updated configs, components and CI to the latest Viash version (PR #8). - -* Updated to Viash 0.9.4 (PR #12). - -* Use dependencies in `openproblems-bio/openproblems` (PR #12). +* ... ## MINOR CHANGES -* Updated `README.md` (PR #5). +* ... ## BUGFIXES +* ... diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 4a921f8..969ee5b 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'process_datasets' workflow is implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 1: check that 'task_spatial_segmentation' is the correct task name in the following command." 
echo " Step 2: replace the rename keys parameters to fit your process_dataset inputs" echo " Step 3: replace the settings parameter to fit your process_dataset outputs" echo " Step 4: remove this message" @@ -19,10 +19,10 @@ input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: '$id/state.yaml' settings: '{"output_spatial_unlabelled": "$id/output_spatial_unlabelled.zarr", "output_spatial_solution": "$id/output_spatial_solution.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' -publish_dir: s3://openproblems-data/resources/task_template/datasets/ +publish_dir: s3://openproblems-data/resources/task_spatial_segmentation/datasets/ HERE -tw launch https://github.com/openproblems-bio/task_template.git \ +tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \ --revision build/main \ --pull-latest \ --main-script target/nextflow/workflows/process_datasets/main.nf \ @@ -31,4 +31,4 @@ tw launch https://github.com/openproblems-bio/task_template.git \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,process_datasets + --labels task_spatial_segmentation,process_datasets diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 808df7f..4dac141 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -13,7 +13,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 1: check that 'task_spatial_segmentation' is the correct task name in the following command." 
echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" echo " Step 4: remove this message" @@ -37,7 +37,7 @@ publish_dir: "$publish_dir" HERE # run the benchmark -nextflow run openproblems-bio/task_template \ +nextflow run openproblems-bio/task_spatial_segmentation \ -r build/main \ -main-script target/nextflow/workflows/run_benchmark/main.nf \ -profile docker \ diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 745aa77..50ae775 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 1: check that 'task_spatial_segmentation' is the correct task name in the following command." 
echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" echo " Step 4: remove this message" @@ -18,17 +18,17 @@ set -e # generate a unique id RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" +publish_dir="s3://openproblems-data/resources/task_spatial_segmentation/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml +input_states: s3://openproblems-data/resources/task_spatial_segmentation/datasets/**/state.yaml rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE -tw launch https://github.com/openproblems-bio/task_template.git \ +tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \ --revision build/main \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ @@ -37,4 +37,4 @@ tw launch https://github.com/openproblems-bio/task_template.git \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,full \ No newline at end of file + --labels task_spatial_segmentation,full diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh index bc9c619..44f14c5 100755 --- a/scripts/run_benchmark/run_test_seqeracloud.sh +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -6,35 +6,27 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -# remove this when you have implemented the script -echo "TODO: once the 'run_benchmark' workflow has been 
implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." -echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" -echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" -echo " Step 4: remove this message" -exit 1 - set -e -resources_test_s3=s3://openproblems-data/resources_test/task_template -publish_dir_s3="s3://openproblems-nextflow/temp/results/$(date +%Y-%m-%d_%H-%M-%S)" +resources_test_s3=s3://openproblems-data/resources_test/task_spatial_segmentation +publish_dir_s3="s3://hca-op-spatial/temp/results/$(date +%Y-%m-%d_%H-%M-%S)" # write the parameters to file cat > /tmp/params.yaml << HERE -id: cxg_mouse_pancreas_atlas -input_train: $resources_test_s3/cxg_mouse_pancreas_atlas/train.h5ad -input_test: $resources_test_s3/cxg_mouse_pancreas_atlas/test.h5ad -input_solution: $resources_test_s3/cxg_mouse_pancreas_atlas/solution.h5ad +id: mouse_brain_combined +input_spatial_unlabelled: $resources_test_s3/mouse_brain_combined/spatial_unlabelled.zarr +input_spatial_solution: $resources_test_s3/mouse_brain_combined/spatial_solution.zarr +input_scrnaseq_reference: $resources_test_s3/mouse_brain_combined/scrnaseq_reference.h5ad output_state: "state.yaml" publish_dir: $publish_dir_s3 HERE -tw launch https://github.com/openproblems-bio/task_template.git \ +tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \ --revision build/main \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --workspace 8386213183400 \ + --compute-env 7Odt43ln9XureGja6Frdm7 \ --params-file /tmp/params.yaml \ - --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,test + --config src/base/labels_tw.config \ + --labels task_spatial_segmentation,test diff --git a/src/base/labels_tw.config b/src/base/labels_tw.config new 
file mode 100644 index 0000000..595e63e --- /dev/null +++ b/src/base/labels_tw.config @@ -0,0 +1,158 @@ +// copied from 'common/nextflow_helpers/labels_tw.config', but the queues in the gpu labels have been updated + +def exitStrat(task, max_attempts = 3) { + println "Determining exit strategy for task (attempt '${task.attempt}', exit status '${task.exitStatus}')" + + // if the component failed max_attempts times (3 by default), ignore the error so the workflow can continue + // it's important 'ignore' is returned even if maxRetries is set to 3, + // otherwise the workflow will stop + if (task.attempt >= max_attempts) { + return 'ignore' + } + // when an aws spot instance is reclaimed, nextflow seems to use exit code 2147483647 + // throwing in some extra conditions just in case + if (task.exitStatus == null || task.exitStatus <= -1 || task.exitStatus > 2100000000 || !(task.exitStatus.toString().isNumber())) { + return 'retry' + } + // if component failed, retry once + if (task.exitStatus == 1 && task.attempt < 2) { + return 'retry' + } + // if component ran out of memory, retry with more memory and disk + if (task.exitStatus in [137, 139] && task.attempt < max_attempts) { + return 'retry' + } + // return 'ignore' for all other cases to ignore the error, + // otherwise the workflow will stop + return 'ignore' +} + +aws { + batch { + maxTransferAttempts = 3 + delayBetweenAttempts = '5 sec' + maxSpotAttempts = 8 + } +} + +process { + executor = 'awsbatch' + + // Default disk space + disk = 50.GB + + // Retry for exit codes that have something to do with memory issues + // always retry once + errorStrategy = { exitStrat(task) } + maxRetries = 3 + maxMemory = null + + // Resource labels + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowmem { + memory = { get_memory( 20.GB * task.attempt ) } + disk = { 50.GB * task.attempt } + } + withLabel: midmem { + memory = { get_memory( 50.GB * task.attempt ) } + disk = { 100.GB * task.attempt } + } + 
withLabel: highmem { + memory = { get_memory( 100.GB * task.attempt ) } + disk = { 200.GB * task.attempt } + } + withLabel: veryhighmem { + memory = { get_memory( 200.GB * task.attempt ) } + disk = { 400.GB * task.attempt } + } + withLabel: lowsharedmem { + containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""} + } + withLabel: midsharedmem { + containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""} + } + withLabel: highsharedmem { + containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""} + } + withLabel: gpu { + // assuming g6.8xlarge + cpus = 32 + accelerator = 1 + memory = 100.GB + queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" + // ondemand: + // queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U-work" + containerOptions = { workflow.containerEngine == "singularity" ? '--nv': + ( workflow.containerEngine == "docker" ? '--gpus all': null ) } + } + withLabel: midgpu { + // assuming g6.8xlarge (NOTE(review): the AWS G6 spec lists g6.8xlarge with 1 GPU; 4 accelerators presumably need g6.12xlarge or larger -- confirm) + cpus = 32 + accelerator = 4 + memory = 100.GB + queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" + // ondemand: + // queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U-work" + containerOptions = { workflow.containerEngine == "singularity" ? '--nv': + ( workflow.containerEngine == "docker" ? '--gpus all': null ) } + } + withLabel: highgpu { + // assuming g6.16xlarge (NOTE(review): the AWS G6 spec lists g6.16xlarge with 1 GPU; 8 accelerators presumably need g6.48xlarge -- confirm) + cpus = 64 + accelerator = 8 + memory = 200.GB + queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" + // ondemand: + // queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U-work" + containerOptions = { workflow.containerEngine == "singularity" ? '--nv': + ( workflow.containerEngine == "docker" ? 
'--gpus all': null ) } + } + withLabel: biggpu { + // assuming p5.4xlarge + cpus = 16 + accelerator = 1 + memory = 200.GB + queue = "TowerForge-jvrqgsfAj9Zm3kua7j07P-work" + containerOptions = { workflow.containerEngine == "singularity" ? '--nv': + ( workflow.containerEngine == "docker" ? '--gpus all': null ) } + } + + // make sure publishstates gets enough disk space and memory + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) { + return process.maxMemory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" 
+ System.exit(1) + } +} + +// set tracing file +trace { + enabled = true + overwrite = true + file = "${params.publish_dir}/trace.txt" +} + +aws.batch.maxSpotAttempts = 5 +google.batch.maxSpotAttempts = 5 From 0cd3daf3fa0556645d6ffcc9887591d18c965360 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 13 May 2026 13:39:30 +0200 Subject: [PATCH 2/2] update config --- src/base/labels_tw.config | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/base/labels_tw.config b/src/base/labels_tw.config index 595e63e..3a03eb1 100644 --- a/src/base/labels_tw.config +++ b/src/base/labels_tw.config @@ -81,9 +81,8 @@ process { cpus = 32 accelerator = 1 memory = 100.GB - queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" - // ondemand: - // queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U-work" + // queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" // spot (less expensive, might need to wait longer) + queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U" // ondemand (more expensive) containerOptions = { workflow.containerEngine == "singularity" ? '--nv': ( workflow.containerEngine == "docker" ? '--gpus all': null ) } } @@ -92,9 +91,8 @@ process { cpus = 32 accelerator = 4 memory = 100.GB - queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" - // ondemand: - // queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U-work" + // queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" // spot (less expensive, might need to wait longer) + queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U" // ondemand (more expensive) containerOptions = { workflow.containerEngine == "singularity" ? '--nv': ( workflow.containerEngine == "docker" ? 
'--gpus all': null ) } } @@ -103,9 +101,8 @@ process { cpus = 64 accelerator = 8 memory = 200.GB - queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" - // ondemand: - // queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U-work" + // queue = "TowerForge-YxBvZ5IJWipqLJBWlIx34-work" // spot (less expensive, might need to wait longer) + queue = "TowerForge-1DIo1otpXKvOF1jgVVKo8U" // ondemand (more expensive) containerOptions = { workflow.containerEngine == "singularity" ? '--nv': ( workflow.containerEngine == "docker" ? '--gpus all': null ) } } @@ -114,7 +111,7 @@ process { cpus = 16 accelerator = 1 memory = 200.GB - queue = "TowerForge-jvrqgsfAj9Zm3kua7j07P-work" + queue = "TowerForge-jvrqgsfAj9Zm3kua7j07P-work" // spot (less expensive, might need to wait longer) containerOptions = { workflow.containerEngine == "singularity" ? '--nv': ( workflow.containerEngine == "docker" ? '--gpus all': null ) } }