#!/bin/bash
# Parallel training script for sparsity predictors
# Divides layer training across multiple parallel train.py jobs
#
# Usage:
#   ./train_parallel.sh --layers-per-job 4 --config meta-llama/Llama-2-7b-hf \
#       --dataset_dir ./data/c4 --output_dir ./trained_predictors \
#       --layer_indices 0 1 2 3 4 5 6 7 8 9 10 11 \
#       --batch_size 32 --num_epochs 10 --learning_rate 1e-5
#
# Example with LoRA grid:
#   ./train_parallel.sh --layers-per-job 3 --num_layers 32 --config meta-llama/Llama-2-7b-hf \
#       --dataset_dir ./data/c4 --output_dir ./trained_predictors \
#       --layer_indices all --lora_sizes 4.0 10.0 20.0 30.0 \
#       --batch_size 32 --num_epochs 10 --learning_rate 1e-5
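#
# For example, with --layers-per-job 4 and 12 layer indices, the script launches
# 3 background train.py jobs (layers 0-3, 4-7, 8-11), each writing its output to
# ./logs/parallel_training_<timestamp>/job_<id>.log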

set -e # Exit on any error

# Default values
LAYERS_PER_JOB=4
TRAIN_ARGS=()
LAYER_INDICES=()
CONFIG=""
NUM_LAYERS=""
PYTHON_CMD="python"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Functions to print colored output
print_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Function to show usage
show_usage() {
    cat << EOF
Usage: $0 --layers-per-job N [train.py arguments...]

Parallel training script that divides layer training across multiple train.py jobs.

Required arguments:
  --layers-per-job N       Number of layers to train in each parallel job

Script-specific arguments:
  --num_layers N           Total number of layers (required when --layer_indices is 'all')

All other arguments are passed directly to train.py. Key arguments include:
  --config MODEL_PATH      Path to model config (required for train.py)
  --dataset_dir PATH       Path to dataset directory (required for train.py)
  --output_dir PATH        Output directory for trained models (required for train.py)
  --layer_indices LAYERS   Layer indices to train (space-separated numbers or 'all')
  --lora_sizes SIZES       LoRA sizes as percentages (e.g., 4.0 10.0 20.0 30.0)
  --batch_size N           Training batch size
  --num_epochs N           Number of training epochs
  --learning_rate RATE     Learning rate
  --use_wandb              Enable Weights & Biases logging

Examples:
  # Train 12 layers with 4 layers per job (3 parallel jobs)
  $0 --layers-per-job 4 --config meta-llama/Llama-2-7b-hf \\
     --dataset_dir ./data/c4 --output_dir ./trained_predictors \\
     --layer_indices 0 1 2 3 4 5 6 7 8 9 10 11 \\
     --batch_size 32 --num_epochs 10 --learning_rate 1e-5

  # Train all 32 layers with LoRA grid using 3 layers per job
  $0 --layers-per-job 3 --num_layers 32 --config meta-llama/Llama-2-7b-hf \\
     --dataset_dir ./data/c4 --output_dir ./trained_predictors \\
     --layer_indices all --lora_sizes 4.0 10.0 20.0 30.0 \\
     --batch_size 32 --num_epochs 10 --learning_rate 1e-5 --use_wandb
EOF
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --layers-per-job)
            LAYERS_PER_JOB="$2"
            shift 2
            ;;
        --num_layers)
            NUM_LAYERS="$2"
            shift 2
            ;;
        --layer_indices)
            shift
            # Collect all layer indices until next flag or end
            while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
                LAYER_INDICES+=("$1")
                shift
            done
            # Add to train args
            TRAIN_ARGS+=(--layer_indices "${LAYER_INDICES[@]}")
            ;;
        --config)
            CONFIG="$2"
            TRAIN_ARGS+=("$1" "$2")
            shift 2
            ;;
        --help|-h)
            show_usage
            exit 0
            ;;
        *)
            # Pass through all other arguments to train.py
            TRAIN_ARGS+=("$1")
            shift
            ;;
    esac
done

# Validate required arguments
if [[ -z "$LAYERS_PER_JOB" ]] || [[ ! "$LAYERS_PER_JOB" =~ ^[0-9]+$ ]] || [[ "$LAYERS_PER_JOB" -lt 1 ]]; then
    print_error "Invalid --layers-per-job value: '$LAYERS_PER_JOB'. Must be a positive integer."
    show_usage
    exit 1
fi

if [[ -z "$CONFIG" ]]; then
    print_error "Missing required argument: --config"
    show_usage
    exit 1
fi

if [[ ${#LAYER_INDICES[@]} -eq 0 ]]; then
    print_error "Missing required argument: --layer_indices"
    show_usage
    exit 1
fi
print_info "Parallel training configuration:"
print_info " Layers per job: $LAYERS_PER_JOB"
print_info " Model config: $CONFIG"
print_info " Layer indices: ${LAYER_INDICES[*]}"

# Handle 'all' layers by generating the list from num_layers
if [[ ${#LAYER_INDICES[@]} -eq 1 && "${LAYER_INDICES[0]}" == "all" ]]; then
    if [[ -z "$NUM_LAYERS" ]] || [[ ! "$NUM_LAYERS" =~ ^[0-9]+$ ]] || [[ "$NUM_LAYERS" -lt 1 ]]; then
        print_error "When --layer_indices is 'all', you must specify --num_layers with a positive integer"
        print_error "Example: --num_layers 32 --layer_indices all"
        exit 1
    fi
    print_info "Generating layer indices for 'all' option with $NUM_LAYERS layers..."
    # Generate layer indices from 0 to NUM_LAYERS-1
    LAYER_INDICES=()
    for ((i=0; i<NUM_LAYERS; i++)); do
        LAYER_INDICES+=("$i")
    done
    print_info "Generated $NUM_LAYERS layers: ${LAYER_INDICES[*]}"
fi
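
# Note: TRAIN_ARGS still contains the literal 'all' token at this point; that is
# fine, since run_training_job below drops the original --layer_indices values and
# substitutes each job's own slice of LAYER_INDICES.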

# Calculate number of jobs needed
TOTAL_LAYERS=${#LAYER_INDICES[@]}
NUM_JOBS=$(( (TOTAL_LAYERS + LAYERS_PER_JOB - 1) / LAYERS_PER_JOB ))
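# The expression above is integer ceiling division, so leftover layers still get
# a (smaller) final job: e.g. 10 layers at 4 per job -> (10 + 3) / 4 = 3 jobs (4, 4, 2)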
print_info "Training plan:"
print_info " Total layers: $TOTAL_LAYERS"
print_info " Layers per job: $LAYERS_PER_JOB"
print_info " Number of parallel jobs: $NUM_JOBS"
# Create output directory for job logs
LOG_DIR="./logs/parallel_training_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"
print_info "Job logs will be saved to: $LOG_DIR"

# Function to run a single training job
run_training_job() {
    local job_id=$1
    local job_layers=("${@:2}")
    local job_name="job_${job_id}"
    local log_file="${LOG_DIR}/${job_name}.log"

    print_info "Starting $job_name with layers: ${job_layers[*]}"

    # Build the command: copy TRAIN_ARGS, but swap the original layer list
    # for this job's slice of layers
    local cmd=("$PYTHON_CMD" "train.py")
    local in_layer_indices=false
    for arg in "${TRAIN_ARGS[@]}"; do
        if [[ "$arg" == "--layer_indices" ]]; then
            in_layer_indices=true
            cmd+=("$arg")
            # Add the job-specific layers
            cmd+=("${job_layers[@]}")
        elif [[ "$in_layer_indices" == true && ! "$arg" =~ ^-- ]]; then
            # Skip the original layer indices
            continue
        else
            in_layer_indices=false
            cmd+=("$arg")
        fi
    done

    print_info "$job_name command: ${cmd[*]}"

    # Run the training job, capturing stdout and stderr in the per-job log
    if "${cmd[@]}" > "$log_file" 2>&1; then
        print_success "$job_name completed successfully"
        return 0
    else
        print_error "$job_name failed! Check log: $log_file"
        return 1
    fi
}
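
# For illustration, a per-job command generated above might look like (flags depend
# on what you pass through; the paths are just those from the usage examples):
#   python train.py --config meta-llama/Llama-2-7b-hf --dataset_dir ./data/c4 \
#       --output_dir ./trained_predictors --layer_indices 4 5 6 7 \
#       --batch_size 32 --num_epochs 10 --learning_rate 1e-5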

# Start all jobs in parallel
print_info "Starting $NUM_JOBS parallel training jobs..."
JOB_PIDS=()
FAILED_JOBS=()

for ((job_id=0; job_id<NUM_JOBS; job_id++)); do
    start_idx=$((job_id * LAYERS_PER_JOB))
    end_idx=$((start_idx + LAYERS_PER_JOB))

    # Get this job's slice of the layer list
    job_layers=()
    for ((i=start_idx; i<end_idx && i<TOTAL_LAYERS; i++)); do
        job_layers+=("${LAYER_INDICES[i]}")
    done

    # Start the job in the background; $! is the PID of the job just launched
    run_training_job "$job_id" "${job_layers[@]}" &
    JOB_PIDS+=($!)

    # Small delay to avoid overwhelming the system
    sleep 2
done
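
# Note: all jobs run concurrently and share this machine's resources. If each
# train.py job needs its own GPU, one option (not handled by this script) would
# be to pin devices when launching, e.g. by setting CUDA_VISIBLE_DEVICES per job.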
print_info "All jobs started. Waiting for completion..."
print_info "Job PIDs: ${JOB_PIDS[*]}"
print_info "You can monitor progress with: tail -f ${LOG_DIR}/*.log"

# Wait for all jobs to complete; 'wait PID' blocks until that job exits and
# returns its exit status
FAILED_COUNT=0
for ((i=0; i<${#JOB_PIDS[@]}; i++)); do
    pid=${JOB_PIDS[i]}
    job_id=$i
    if wait "$pid"; then
        print_success "Job $job_id (PID $pid) completed successfully"
    else
        print_error "Job $job_id (PID $pid) failed"
        FAILED_JOBS+=("$job_id")
        # Use arithmetic expansion rather than ((FAILED_COUNT++)): the latter
        # returns exit status 1 when the pre-increment value is 0, which would
        # abort the script under 'set -e'
        FAILED_COUNT=$((FAILED_COUNT + 1))
    fi
done

# Summary
echo
echo "==================== TRAINING SUMMARY ===================="
print_info "Total jobs: $NUM_JOBS"
print_info "Successful jobs: $((NUM_JOBS - FAILED_COUNT))"
print_info "Failed jobs: $FAILED_COUNT"

if [[ $FAILED_COUNT -eq 0 ]]; then
    print_success "All training jobs completed successfully!"
    print_info "Check your output directory for trained models"
else
    print_error "Some training jobs failed: ${FAILED_JOBS[*]}"
    print_info "Check the following log files for details:"
    for job_id in "${FAILED_JOBS[@]}"; do
        echo " ${LOG_DIR}/job_${job_id}.log"
    done
    exit 1
fi

print_info "Log directory: $LOG_DIR"
echo "============================================================"