Skip to content
343 changes: 339 additions & 4 deletions cookbook/client/server/megatron/run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,341 @@
#!/bin/bash

# ============================================
# Twinkle Megatron service launch script
# ============================================
# Purpose: start a Ray cluster (multiple GPU/CPU nodes supported), Prometheus monitoring, and the Twinkle server
#
# Usage: ./run.sh [options]
#
# Options:
# --head NODE Head node GPU device list, comma-separated; the GPU count is derived from it (default: 0,1,2,3)
# --gpu-workers LIST GPU worker device lists, one per worker, separated by semicolons (default: 4,5,6,7)
# --cpu-workers N Number of CPU workers (default: 1)
# --temp-dir DIR Ray temp directory (default: /dashscope/caches/application/ray_logs)
# --help Show the help text
#
# Examples:
# ./run.sh # use the default configuration
# ./run.sh --head "0,1,2,3" --gpu-workers "4,5,6,7" --cpu-workers 1
# ./run.sh --head "0,1,2,3" --gpu-workers "" --cpu-workers 0
# ./run.sh --head "" --gpu-workers "" --cpu-workers 4 # CPU-only mode (the default GPU worker must be disabled too)
# ./run.sh --temp-dir /tmp/my_ray_logs # custom temp directory
# ============================================

set -e # Exit immediately when a command fails

# ============================================
# Configuration (adjust for your environment)
# ============================================

# --- Ray cluster layout ---
# The layout is selected with named command-line options (parsed further
# below), not with positional parameters:
#   --head         comma-separated GPU device list for the head node, which
#                  is always started first, e.g. "0,1,2,3"; "" means CPU only
#   --gpu-workers  semicolon-separated device lists, one per GPU worker,
#                  e.g. "4,5,6,7" or "4,5,6,7;8,9,10,11"
#   --cpu-workers  number of CPU-only workers

# --- Network ---
RAY_PORT=6379
RAY_ADDRESS="127.0.0.1:$RAY_PORT"

# --- Paths ---
DEFAULT_TEMP_DIR="/dashscope/caches/application/ray_logs"
LOG_FILE="run.log"

# --- Prometheus monitoring ---
PROMETHEUS_BIN="/dashscope/caches/application/monitor/prometheus-3.10.0.linux-amd64/prometheus"
# Path of the Prometheus config relative to the Ray temp directory.
PROMETHEUS_CONFIG_SUFFIX="session_latest/metrics/prometheus/prometheus.yml"

# --- Ray log rotation ---
# Exported so that the Ray processes started by this script inherit them.
# NOTE(review): 1024 looks very small if the unit is bytes - confirm.
export RAY_ROTATION_MAX_BYTES=1024
export RAY_ROTATION_BACKUP_COUNT=1

# Nothing is started here. The hard-coded `ray start` / `python server.py`
# commands that used to follow were removed: they launched a fixed cluster
# and blocked on the server before any option had been parsed.

# ============================================
# Option parsing (accepts both "--key value" and "--key=value")
# ============================================

# Defaults, used for every option that is not given on the command line.
HEAD_NODE="0,1,2,3"
GPU_WORKERS_INPUT="4,5,6,7"
CPU_WORKER_COUNT="1"
TEMP_DIR="$DEFAULT_TEMP_DIR"

# Print the help text to stdout.
print_usage() {
  cat <<'EOF'
用法: ./run.sh [选项]

选项:
 --head NODE Head 节点 GPU 设备列表,逗号分隔 (默认: 0,1,2,3)
 --gpu-workers LIST GPU Worker 列表,分号分隔多个节点 (默认: 4,5,6,7)
 --cpu-workers N CPU Worker 数量 (默认: 1)
 --temp-dir DIR Ray 临时目录
 --help, -h 显示帮助信息

示例:
 ./run.sh # 默认配置
 ./run.sh --head '0,1,2,3' --gpu-workers '4,5,6,7'
 ./run.sh --head '0,1,2,3,4,5,6,7' # 单机 8 卡
 ./run.sh --gpu-workers '4,5,6,7;8,9,10,11' # 多 GPU Worker
 ./run.sh --cpu-workers 4 --head '' --gpu-workers '' # 纯 CPU 模式
EOF
}

# Report a command-line error ($1) on stderr and exit with status 1.
# Deliberately self-contained: the print_* helpers are defined later in the
# file, so they do not exist yet when the options are parsed.
usage_error() {
  printf '\033[31m[ERROR]\033[0m %s\n' "$1" >&2
  printf '%s\n' "使用 --help 查看帮助" >&2
  exit 1
}

# Parse the command line into the globals HEAD_NODE, GPU_WORKERS_INPUT,
# CPU_WORKER_COUNT and TEMP_DIR, then derive GPU_WORKERS (array) and
# PROMETHEUS_CONFIG from them.
# Arguments: the script's command-line arguments.
# Exits:     0 after --help; 1 on an unknown option, a missing option value
#            or a --cpu-workers value that is not a non-negative integer.
parse_args() {
  local opt value entry
  local -a raw_workers=()

  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --help|-h)
        print_usage
        exit 0
        ;;
      --head|--gpu-workers|--cpu-workers|--temp-dir)
        # An empty value ("") is legal, a missing one is not; without this
        # check `shift 2` would fail and `set -e` would exit silently.
        [[ $# -ge 2 ]] || usage_error "选项 $opt 需要一个参数"
        value="$2"
        shift 2
        ;;
      --head=*|--gpu-workers=*|--cpu-workers=*|--temp-dir=*)
        value="${opt#*=}"
        opt="${opt%%=*}"
        shift
        ;;
      *)
        usage_error "未知参数: $opt"
        ;;
    esac

    case "$opt" in
      --head) HEAD_NODE="$value" ;;
      --gpu-workers) GPU_WORKERS_INPUT="$value" ;;
      --cpu-workers) CPU_WORKER_COUNT="$value" ;;
      --temp-dir) TEMP_DIR="$value" ;;
    esac
  done

  # The worker count is used in integer tests and loops later on, so reject
  # anything that is not a non-negative integer. Empty means "no CPU workers".
  if [[ -z "$CPU_WORKER_COUNT" ]]; then
    CPU_WORKER_COUNT=0
  fi
  [[ "$CPU_WORKER_COUNT" =~ ^[0-9]+$ ]] \
    || usage_error "--cpu-workers 必须是非负整数: $CPU_WORKER_COUNT"
  # Force base 10 so that a value such as "08" is not read as octal.
  CPU_WORKER_COUNT=$((10#$CPU_WORKER_COUNT))

  # Split the semicolon-separated list into an array, dropping empty entries
  # (e.g. from "a;;b" or a trailing ";") so they do not become GPU-less workers.
  GPU_WORKERS=()
  if [[ -n "$GPU_WORKERS_INPUT" ]]; then
    IFS=';' read -ra raw_workers <<< "$GPU_WORKERS_INPUT"
  fi
  for entry in "${raw_workers[@]}"; do
    if [[ -n "$entry" ]]; then
      GPU_WORKERS+=("$entry")
    fi
  done

  PROMETHEUS_CONFIG="${TEMP_DIR}/${PROMETHEUS_CONFIG_SUFFIX}"
}

parse_args "$@"

# ============================================
# 辅助函数
# ============================================
# Print message $1 on stdout, prefixed with a cyan "[INFO]" tag.
# Backslash escapes inside the message are expanded.
print_info() {
  local msg="$1"
  printf '\033[36m[INFO]\033[0m %b\n' "$msg"
}

# Print message $1 on stdout, prefixed with a green "[SUCCESS]" tag.
# Backslash escapes inside the message are expanded.
print_success() {
  local msg="$1"
  printf '\033[32m[SUCCESS]\033[0m %b\n' "$msg"
}

# Print message $1 on stdout, prefixed with a yellow "[WARNING]" tag.
# Backslash escapes inside the message are expanded.
print_warning() {
  local msg="$1"
  printf '\033[33m[WARNING]\033[0m %b\n' "$msg"
}

# Print message $1 prefixed with a red "[ERROR]" tag.
# The message goes to stderr so that errors stay visible when stdout is
# redirected or captured. Backslash escapes inside the message are expanded.
print_error() {
  local msg="$1"
  printf '\033[31m[ERROR]\033[0m %b\n' "$msg" >&2
}

# Print a horizontal rule made of "=" characters on stdout.
print_separator() {
  local rule="============================================"
  printf '%s\n' "$rule"
}

# Print a section banner on stdout: an empty line, then the title $1 in bold
# blue between two separator rules.
print_header() {
  local title="$1"
  printf '\n'
  print_separator
  printf '\033[1;34m %b \033[0m\n' "$title"
  print_separator
}

# Parse a node GPU specification into the globals _gpu_devices / _gpu_count.
#
# Accepted forms of $1:
#   ""            -> no GPUs:     _gpu_devices="",        _gpu_count=0
#   "0,1,2,3"     -> device list: _gpu_devices="0,1,2,3", _gpu_count=4
#   "0,1,2,3:4"   -> device list with an explicit count after ":"; the count
#                    wins over the derived one. A suffix that is not a
#                    non-negative integer is ignored and the count is derived.
# Whitespace anywhere in the specification is dropped, so "0, 1" means "0,1".
#
# Globals:   _gpu_devices, _gpu_count (written)
# Arguments: $1 - node specification
# Returns:   0
parse_node_config() {
  local spec="${1//[[:space:]]/}"
  local devices="$spec"
  local explicit_count=""
  local commas

  # Split off the optional ":count" suffix so that it can never end up in
  # CUDA_VISIBLE_DEVICES.
  if [[ "$spec" == *:* ]]; then
    devices="${spec%%:*}"
    explicit_count="${spec#*:}"
  fi

  if [[ -z "$devices" ]]; then
    _gpu_devices=""
    _gpu_count=0
    return 0
  fi

  _gpu_devices="$devices"
  if [[ "$explicit_count" =~ ^[0-9]+$ ]]; then
    # Force base 10 so that a value such as "08" is not read as octal.
    _gpu_count=$((10#$explicit_count))
  else
    # Number of devices = number of commas + 1; counted with parameter
    # expansion instead of an echo | tr | wc pipeline.
    commas="${devices//[^,]/}"
    _gpu_count=$(( ${#commas} + 1 ))
  fi
  return 0
}

# ============================================
# Startup
# ============================================
print_header "Twinkle Megatron 服务启动脚本"

# Summarise the cluster layout that is about to be started.
print_info "集群配置:"
printf '\n'

# Head node: either its GPU devices and count, or "CPU only".
parse_node_config "$HEAD_NODE"
if [ -z "$_gpu_devices" ]; then
  printf '%s\n' " [Head 节点] CPU only"
else
  printf '%s\n' " [Head 节点]"
  printf '%s\n' " - GPU 设备: $_gpu_devices"
  printf '%s\n' " - GPU 数量: $_gpu_count"
fi

# GPU workers: one line per worker, numbered from 1.
gpu_worker_total=${#GPU_WORKERS[@]}
if [ "$gpu_worker_total" -gt 0 ]; then
  printf '\n'
  printf '%s\n' " [GPU Worker 节点] 共 ${gpu_worker_total} 个"
  worker_no=0
  for worker_spec in "${GPU_WORKERS[@]}"; do
    worker_no=$((worker_no + 1))
    parse_node_config "$worker_spec"
    printf '%s\n' " Worker ${worker_no}: GPU=$_gpu_devices, Count=$_gpu_count"
  done
fi

# CPU workers: shown only when at least one is requested.
if [ "$CPU_WORKER_COUNT" -gt 0 ]; then
  printf '\n'
  printf '%s\n' " [CPU Worker 节点] $CPU_WORKER_COUNT 个"
fi

printf '\n'
print_info "运行参数:"
printf '%s\n' " - Ray 地址: $RAY_ADDRESS"
printf '%s\n' " - 临时目录: $TEMP_DIR"
printf '%s\n' " - 日志文件: $LOG_FILE"
printf '\n'

# Make sure the Ray temp directory exists before Ray is started.
[ -d "$TEMP_DIR" ] || {
  print_info "创建临时目录: $TEMP_DIR"
  mkdir -p "$TEMP_DIR"
}

# ============================================
# Stop any running Ray cluster and Prometheus
# ============================================
print_header "清理环境"
print_info "停止已有的 Ray 集群..."
# Best-effort: there may be no cluster to stop, which must not abort the
# script under `set -e`.
ray stop --force 2>/dev/null || true

print_info "停止已有的 Prometheus..."
# -x matches the process name exactly. The bare pattern "prometheus" would
# also kill unrelated processes whose name merely contains it (for example
# prometheus-node-exporter). Best-effort as above: pkill exits non-zero when
# nothing matched.
pkill -x prometheus 2>/dev/null || true

# ============================================
# Start the Ray head node
# ============================================
print_header "启动 Ray 集群"

# Decide the head node's GPU visibility and announce what is being started.
# With no devices configured the head runs CPU only: no visible devices and
# zero GPUs advertised to Ray.
parse_node_config "$HEAD_NODE"
head_devices=""
head_gpu_total=0
head_notice="启动 Head 节点 (CPU only)..."
if [ -n "$_gpu_devices" ]; then
  head_devices="$_gpu_devices"
  head_gpu_total="$_gpu_count"
  head_notice="启动 Head 节点 (GPU: $_gpu_devices)..."
fi
print_info "$head_notice"

# Both modes share the same options; only the GPU visibility differs.
head_args=(
  --head
  --port="$RAY_PORT"
  --num-gpus="$head_gpu_total"
  --disable-usage-stats
  --include-dashboard=true
  --temp-dir="$TEMP_DIR"
)
CUDA_VISIBLE_DEVICES="$head_devices" ray start "${head_args[@]}"
print_success "Head 节点启动成功!"

# ============================================
# Start the GPU worker nodes
# ============================================
# One `ray start` per entry of GPU_WORKERS, each limited to its own devices.
# Workers are numbered from 1 in the messages.
worker_no=0
for worker_spec in "${GPU_WORKERS[@]}"; do
  worker_no=$((worker_no + 1))
  parse_node_config "$worker_spec"
  print_info "启动 GPU Worker ${worker_no} (GPU: $_gpu_devices)..."
  CUDA_VISIBLE_DEVICES="$_gpu_devices" ray start --address="$RAY_ADDRESS" --num-gpus="$_gpu_count"
  print_success "GPU Worker ${worker_no} 启动成功!"
done

# ============================================
# Start the CPU worker nodes
# ============================================
# Each CPU worker joins the cluster with no visible GPU devices and zero
# GPUs advertised to Ray.
if [ "$CPU_WORKER_COUNT" -gt 0 ]; then
  print_info "启动 $CPU_WORKER_COUNT 个 CPU Worker..."
  cpu_worker_args=(--address="$RAY_ADDRESS" --num-gpus=0)
  for ((cpu_started = 0; cpu_started < CPU_WORKER_COUNT; cpu_started++)); do
    CUDA_VISIBLE_DEVICES="" ray start "${cpu_worker_args[@]}"
  done
  print_success "CPU Worker 启动成功!"
fi

# ============================================
# Show the cluster status
# ============================================
printf '\n'
print_info "集群状态:"
# Informational only: a failing status query must not abort the launch.
ray status 2>/dev/null || :

# ============================================
# Start Prometheus monitoring (optional)
# ============================================
print_header "启动监控(可选)"

# Monitoring is skipped with a warning when either the Prometheus binary or
# the Prometheus config file is missing. PROMETHEUS_PID stays empty then.
PROMETHEUS_PID=""
if [ ! -f "$PROMETHEUS_BIN" ]; then
  print_warning "未检测到 Prometheus,跳过监控启动"
  printf '%s\n' " - 预期路径: $PROMETHEUS_BIN"
else
  print_info "检测到 Prometheus,正在启动监控服务..."

  # Give Ray a moment to write the Prometheus config.
  sleep 2

  if [ ! -f "$PROMETHEUS_CONFIG" ]; then
    print_warning "Prometheus 配置文件不存在,跳过监控启动"
    printf '%s\n' " - 预期路径: $PROMETHEUS_CONFIG"
  else
    # Run detached in the background; its output goes to prometheus.log.
    nohup "$PROMETHEUS_BIN" --config.file="$PROMETHEUS_CONFIG" > prometheus.log 2>&1 &
    PROMETHEUS_PID=$!
    print_success "Prometheus 监控已启动 (PID: $PROMETHEUS_PID)"
    printf '%s\n' " - 监控日志: prometheus.log"
    printf '%s\n' " - 配置文件: $PROMETHEUS_CONFIG"
  fi
fi

# ============================================
# Start the Twinkle server
# ============================================
print_header "启动 Twinkle 服务器"

print_info "日志输出到: $LOG_FILE"
echo ""

# Create (or truncate) the log up front so that `tail` below can never start
# before the file exists.
: > "$LOG_FILE"

# Run the server in the background, detached from hangups, logging to the file.
nohup python server.py > "$LOG_FILE" 2>&1 &
SERVER_PID=$!

# Stream the log until the server exits. --pid (GNU tail) makes tail stop
# once the server process is gone; a plain `tail -f` would block forever
# after a server crash.
tail --pid="$SERVER_PID" -f "$LOG_FILE"

# Propagate the server's exit status as the script's own, and stop here so
# that nothing after this section is ever executed.
server_status=0
wait "$SERVER_PID" || server_status=$?
exit "$server_status"
2 changes: 1 addition & 1 deletion cookbook/client/server/megatron/server_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ applications:
nproc_per_node: 4 # Number of GPU processes per node
device_group:
name: model
ranks: 4 # GPU rank indices
ranks: 4
device_type: cuda
device_mesh:
device_type: cuda
Expand Down
Loading
Loading