From e816a15bb870ab6ec5777bd7718f29bc5ecf21f3 Mon Sep 17 00:00:00 2001
From: Akanksha Gupta
Date: Wed, 6 May 2026 14:34:52 -0700
Subject: [PATCH] Restart the Pathways worker entrypoint on exit code 1

PiperOrigin-RevId: 911553878
---
 .../yamls/pw-service.yaml                     | 38 ++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/pathwaysutils/experimental/shared_pathways_service/yamls/pw-service.yaml b/pathwaysutils/experimental/shared_pathways_service/yamls/pw-service.yaml
index 19769db..88d0f2a 100644
--- a/pathwaysutils/experimental/shared_pathways_service/yamls/pw-service.yaml
+++ b/pathwaysutils/experimental/shared_pathways_service/yamls/pw-service.yaml
@@ -83,10 +83,40 @@ spec:
             - name: pathways-worker
               image: ${SERVER_IMAGE}
               imagePullPolicy: Always
-              args:
-              - --server_port=29005
-              - --resource_manager_address=$$(PATHWAYS_HEAD):29001
-              - --gcs_scratch_location=${GCS_SCRATCH_LOCATION}
+              command:
+              - /bin/sh
+              - -c
+              - |
+                # Supervisor loop for the Pathways worker: locate the server
+                # binary, run it, and restart it whenever it exits with status 1.
+                # Any other exit status ends this command with that same status.
+                # NOTE(review): "$$" looks like the template escape for a literal
+                # dollar sign; confirm against whatever renders this file.
+                while true; do
+                  echo "Spawning pathways server ..."
+                  TARGET_BIN=""
+                  # A directory also passes -x, so require a regular file too.
+                  for f in /usr/pathways/run/cloud_pathways_server*; do
+                    if [ -f "$$f" ] && [ -x "$$f" ]; then
+                      TARGET_BIN="$$f"
+                      break
+                    fi
+                  done
+                  if [ -z "$$TARGET_BIN" ]; then
+                    echo "Error: Could not find executable cloud_pathways_server* in /usr/pathways/run/" >&2
+                    exit 1
+                  fi
+                  echo "Found pathways server binary: $$TARGET_BIN"
+                  "$$TARGET_BIN" --server_port=29005 --resource_manager_address=$$(PATHWAYS_HEAD):29001 --gcs_scratch_location=${GCS_SCRATCH_LOCATION}
+                  ret_code=$$?
+                  if [ "$$ret_code" -eq 1 ]; then
+                    echo "Worker process exited with return code 1. Restarting the server..."
+                    continue
+                  else
+                    echo "Worker process terminated with unhandled return code: $$ret_code" >&2
+                    exit "$$ret_code"
+                  fi
+                done
               env:
               - name: TPU_MIN_LOG_LEVEL
                 value: "0"