# lambda-gha/.github/workflows/runner.yml (Open-Athena/lambda-gha, branch: main)
name: Lambda Runner
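#
# Configuration variables (read via the `vars` context; can be set at org/repo level):
#   LAMBDA_SSH_KEY_NAMES - Comma-separated list of SSH key names registered in Lambda Labs
#   LAMBDA_INSTANCE_TYPE - Default instance type (e.g., gpu_1x_a10)
#   LAMBDA_REGION - Default region (e.g., us-south-1)
#   MAX_INSTANCE_LIFETIME - Fallback for the max_instance_lifetime input
#   RUNNER_GRACE_PERIOD - Fallback for the runner_grace_period input
#   RUNNER_INITIAL_GRACE_PERIOD - Fallback for the runner_initial_grace_period input
#   RUNNER_POLL_INTERVAL - Fallback for the runner_poll_interval input
#   RUNNER_REGISTRATION_TIMEOUT - Fallback for the runner_registration_timeout input
#
# Priority: inputs > vars > defaults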

# Reusable-workflow interface: this file is only triggered via `workflow_call`.
on:
  workflow_call:
    # Secrets the caller must provide; all three are required.
    secrets:
      # Exposed to the runner-creation step as the GH_PAT environment variable.
      GH_SA_TOKEN:
        description: 'GitHub token with permissions to manage self-hosted runners'
        required: true
      # Exposed to the runner-creation step as LAMBDA_API_KEY.
      LAMBDA_API_KEY:
        description: 'Lambda Labs API key'
        required: true
      # Forwarded to the action as its `ssh_private_key` input.
      LAMBDA_SSH_PRIVATE_KEY:
        description: 'SSH private key for connecting to Lambda instances'
        required: true

    # Every input is optional and declared as a string (numeric ones included).
    # Inputs without a `default` here are forwarded as-is; several of them fall
    # back to a same-purpose `vars.*` value in the `launch` job.
    inputs:
      action_ref:
        description: 'lambda-gha Git ref (branch/tag/SHA) to checkout'
        type: string
        required: false
        default: 'main'
      check_availability:
        description: 'Pre-check capacity before attempting launches (default: auto, enabled when multiple types/regions)'
        type: string
        required: false
      debug:
        description: 'Debug mode: false=off, true/trace=set -x only, number=set -x + sleep N minutes before shutdown'
        type: string
        required: false
        default: 'false'
      extra_gh_labels:
        description: 'Extra GitHub labels for the runner (comma-separated)'
        type: string
        required: false
      instance_count:
        description: 'Number of instances to create (for parallel jobs)'
        type: string
        required: false
        default: '1'
      # Falls back to vars.LAMBDA_INSTANCE_TYPE in the launch job.
      instance_type:
        description: 'Lambda instance type(s), comma-separated for fallback (e.g., gpu_1x_a10,gpu_1x_a100)'
        type: string
        required: false
      # Falls back to vars.MAX_INSTANCE_LIFETIME in the launch job.
      max_instance_lifetime:
        description: 'Maximum instance lifetime in minutes before shutdown (default: 360)'
        type: string
        required: false
      name:
        description: 'Name for the launch job'
        type: string
        required: false
      # Falls back to vars.LAMBDA_REGION in the launch job.
      region:
        description: 'Lambda region(s), comma-separated for fallback (e.g., us-east-1,us-west-1)'
        type: string
        required: false
      retry_count:
        description: 'Number of retries per instance type/region combination (default: 1)'
        type: string
        required: false
        default: '1'
      retry_delay:
        description: 'Initial delay between retries in seconds (default: 5)'
        type: string
        required: false
        default: '5'
      # The four runner_* inputs fall back to the upper-cased vars.RUNNER_*
      # variable of the same name in the launch job.
      runner_grace_period:
        description: 'Seconds before terminating after last job completes (default: 60)'
        type: string
        required: false
      runner_initial_grace_period:
        description: 'Seconds before terminating if no jobs start (default: 180)'
        type: string
        required: false
      runner_poll_interval:
        description: 'Seconds between termination condition checks (default: 10)'
        type: string
        required: false
      runner_registration_timeout:
        description: 'Max seconds to wait for runner registration (default: 300)'
        type: string
        required: false
      # Falls back to vars.LAMBDA_SSH_KEY_NAMES; the launch job fails early
      # when neither is set.
      ssh_key_names:
        description: 'SSH key names registered in Lambda Labs (comma-separated)'
        type: string
        required: false
      userdata:
        description: 'Additional script to run before runner setup'
        type: string
        required: false

    # Values re-exported from the `launch` job for use by the calling workflow.
    outputs:
      id:
        description: 'Runner label for runs-on (single instance)'
        value: ${{ jobs.launch.outputs.id }}
      mtx:
        description: 'JSON array of objects for matrix strategies'
        value: ${{ jobs.launch.outputs.mtx }}

jobs:
  # Single job of this workflow: validates the SSH key configuration, checks
  # out the lambda-gha repository at `action_ref`, and runs the action found at
  # the root of that checkout (`uses: ./`). The `label` and `mtx` outputs of
  # that action step are re-exported as the job outputs `id` and `mtx`.
  launch:
    # NOTE(review): the 'gpu_1x_a10' fallback below only affects the displayed
    # job name; the `instance_type` passed to the action has no such fallback
    # here — presumably the action applies its own default; confirm.
    name: ${{ inputs.name || format('Launch {0}', inputs.instance_type || vars.LAMBDA_INSTANCE_TYPE || 'gpu_1x_a10') }}
    runs-on: ubuntu-latest
    outputs:
      id: ${{ steps.lambda-start.outputs.label }}
      mtx: ${{ steps.lambda-start.outputs.mtx }}
    steps:
      # Fail fast when no SSH key names are configured via input or variable.
      - name: Check SSH key configuration
        # The value is handed to the shell through an environment variable
        # instead of being interpolated into the script text, so quotes or
        # shell metacharacters in it cannot alter or inject commands.
        env:
          SSH_KEY_NAMES: ${{ inputs.ssh_key_names || vars.LAMBDA_SSH_KEY_NAMES }}
        run: |
          if [ -z "$SSH_KEY_NAMES" ]; then
            echo "ERROR: SSH key names must be provided either as input or as LAMBDA_SSH_KEY_NAMES variable"
            exit 1
          fi

      # Fetch the action's own repository (not the caller's) so that the next
      # step can reference it as a local action.
      - name: Checkout lambda-gha repository
        uses: actions/checkout@v4
        with:
          repository: Open-Athena/lambda-gha
          ref: ${{ inputs.action_ref }}

      # Run the action from the checkout above. Inputs written as
      # `inputs.x || vars.Y` prefer the caller's input and fall back to the
      # org/repo-level configuration variable when the input is empty.
      - name: Create Lambda runner
        id: lambda-start
        uses: ./
        with:
          action_ref: ${{ inputs.action_ref }}
          check_availability: ${{ inputs.check_availability }}
          debug: ${{ inputs.debug }}
          extra_gh_labels: ${{ inputs.extra_gh_labels }}
          instance_count: ${{ inputs.instance_count }}
          instance_type: ${{ inputs.instance_type || vars.LAMBDA_INSTANCE_TYPE }}
          max_instance_lifetime: ${{ inputs.max_instance_lifetime || vars.MAX_INSTANCE_LIFETIME }}
          region: ${{ inputs.region || vars.LAMBDA_REGION }}
          retry_count: ${{ inputs.retry_count }}
          retry_delay: ${{ inputs.retry_delay }}
          runner_grace_period: ${{ inputs.runner_grace_period || vars.RUNNER_GRACE_PERIOD }}
          runner_initial_grace_period: ${{ inputs.runner_initial_grace_period || vars.RUNNER_INITIAL_GRACE_PERIOD }}
          runner_poll_interval: ${{ inputs.runner_poll_interval || vars.RUNNER_POLL_INTERVAL }}
          runner_registration_timeout: ${{ inputs.runner_registration_timeout || vars.RUNNER_REGISTRATION_TIMEOUT }}
          ssh_key_names: ${{ inputs.ssh_key_names || vars.LAMBDA_SSH_KEY_NAMES }}
          ssh_private_key: ${{ secrets.LAMBDA_SSH_PRIVATE_KEY }}
          userdata: ${{ inputs.userdata }}
        # Credentials are supplied to the action as environment variables.
        env:
          GH_PAT: ${{ secrets.GH_SA_TOKEN }}
          LAMBDA_API_KEY: ${{ secrets.LAMBDA_API_KEY }}