jenkins/deploy_model_start_2.sh
2025-04-22 10:44:36 +08:00

365 lines
13 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Color definitions (ANSI escape sequences, rendered by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
CYAN='\033[36m'
RESET='\033[0m'
# Workflow of this script (see main):
#   1. Report whether the media download path contains the model weight
#      archive and the conda environment tarball.
#   2. Check whether the model weights and the target conda environment exist
#      on the server node; deploy them from the media path when they do not.
#   3. Check whether the model service is already running and stop it if so.
#   4. Start the model service.
#   5. Poll until the model service has started successfully.
#   6. Run the curl script to verify the model service.
#   7. Copy (cp/scp) the yellow block registration script and the yellow block
#      curl script to the yellow block server.
#   8. On the yellow block server, run the registration and the curl check.
# Print the usage/help text for this script to stdout.
print_help() {
  printf '%s\n' \
    "Usage: $0" \
    "" \
    "This script reads configuration from config.yaml and performs the necessary setup." \
    "" \
    "Example:" \
    " bash $0"
}
# Load the deployment settings from a YAML file into global variables.
# Arguments:
#   $1 - path to the YAML configuration file (read with `yq eval`)
# Globals written:
#   MODEL, MODEL_VERSION, CONDA_ENV, TENSOR_PARALLEL_SIZE, VISIBLE_GPU_INDEX,
#   MODEL_PORT, GPU_MEMORY_UTILIZATION, DTYPE, MODEL_SERVER_IP,
#   YELLOW_BLOCK_SERVER_IP, YELLOW_BLOCK_SERVER_PORT, YELLOW_BLOCK_CONDA_ENV,
#   MEDIA_DOWNLOAD_PATH, MODEL_PATH, MODEL_NAME, MODEL_NAME_PATH, MODEL_TAR,
#   MINICONDA_PATH, YELLOW_BLOCK_PATH, AUTOMATED_DEPLOYMENT_PATH
# Returns:
#   0 on success; 1 as soon as a yq lookup fails.
load_config() {
  local config_file="$1"
  local template
  # Placeholders are kept in variables and quoted inside the expansions so
  # they are matched literally rather than as glob patterns.
  local ph_model='{{model}}'
  local ph_model_version='{{model_version}}'
  local ph_model_path='{{model_path}}'
  local ph_model_name='{{model_name}}'

  MODEL=$(yq eval '.model' "$config_file") || return 1
  MODEL_VERSION=$(yq eval '.model_version' "$config_file") || return 1
  CONDA_ENV=$(yq eval '.conda_env' "$config_file") || return 1
  TENSOR_PARALLEL_SIZE=$(yq eval '.tensor_parallel_size' "$config_file") || return 1
  VISIBLE_GPU_INDEX=$(yq eval '.visible_gpu_index' "$config_file") || return 1
  MODEL_PORT=$(yq eval '.model_port' "$config_file") || return 1
  GPU_MEMORY_UTILIZATION=$(yq eval '.gpu_memory_utilization' "$config_file") || return 1
  DTYPE=$(yq eval '.dtype' "$config_file") || return 1
  MODEL_SERVER_IP=$(yq eval '.model_server_ip' "$config_file") || return 1
  YELLOW_BLOCK_SERVER_IP=$(yq eval '.yellow_block_server_ip' "$config_file") || return 1
  YELLOW_BLOCK_SERVER_PORT=$(yq eval '.yellow_block_server_port' "$config_file") || return 1
  YELLOW_BLOCK_CONDA_ENV=$(yq eval '.yellow_block_conda_env' "$config_file") || return 1
  MEDIA_DOWNLOAD_PATH=$(yq eval '.media_download_path' "$config_file") || return 1
  MODEL_PATH=$(yq eval '.model_path' "$config_file") || return 1

  # Template expansion uses bash pattern substitution instead of sed, so
  # values containing "|", "&" or "\" are inserted literally. The replacement
  # is double-quoted so "&" is not treated as "the matched text" by newer bash.
  template=$(yq eval '.model_name' "$config_file") || return 1
  template=${template//"$ph_model"/"$MODEL"}
  MODEL_NAME=${template//"$ph_model_version"/"$MODEL_VERSION"}

  template=$(yq eval '.model_name_path' "$config_file") || return 1
  template=${template//"$ph_model_path"/"$MODEL_PATH"}
  MODEL_NAME_PATH=${template//"$ph_model_name"/"$MODEL_NAME"}

  template=$(yq eval '.model_tar' "$config_file") || return 1
  MODEL_TAR=${template//"$ph_model_name"/"$MODEL_NAME"}

  MINICONDA_PATH=$(yq eval '.miniconda_path' "$config_file") || return 1
  YELLOW_BLOCK_PATH=$(yq eval '.yellow_block_path' "$config_file") || return 1
  AUTOMATED_DEPLOYMENT_PATH=$(yq eval '.automated_deployment_path' "$config_file") || return 1
}
# Enter the deployment directory and make sure the configuration file exists.
# Abort if the directory cannot be entered: otherwise a config.yaml that
# happens to sit in the caller's working directory would be loaded instead.
cd /data/jenkins_script/automated_deployment || {
  echo "Error: cannot enter /data/jenkins_script/automated_deployment." >&2
  exit 1
}
if [ ! -f "config.yaml" ]; then
  echo "Error: config.yaml not found in the current directory."
  exit 1
fi
# Load the configuration
load_config "config.yaml"
# Print the loaded parameter values, one "NAME: value" line each (debugging aid).
for _cfg_name in \
  MODEL MODEL_VERSION MODEL_NAME MODEL_NAME_PATH MODEL_TAR \
  CONDA_ENV TENSOR_PARALLEL_SIZE VISIBLE_GPU_INDEX MODEL_PORT \
  GPU_MEMORY_UTILIZATION DTYPE MODEL_SERVER_IP \
  YELLOW_BLOCK_SERVER_IP YELLOW_BLOCK_SERVER_PORT YELLOW_BLOCK_CONDA_ENV \
  MEDIA_DOWNLOAD_PATH MODEL_PATH MINICONDA_PATH YELLOW_BLOCK_PATH \
  AUTOMATED_DEPLOYMENT_PATH; do
  # ${!name} is indirect expansion: the value of the variable named by $name.
  printf '%s: %s\n' "$_cfg_name" "${!_cfg_name}"
done
unset _cfg_name
# Locate a helper script and change into the directory that contains it.
# Arguments:
#   $1 - script file name to look for
# Globals read:
#   MINICONDA_PATH, CONDA_ENV, MODEL, AUTOMATED_DEPLOYMENT_PATH
# Returns:
#   0 after cd-ing into the first candidate directory holding the script;
#   1 if neither directory has it or the cd fails.
check_and_enter_script() {
  local wanted="$1"
  local candidate
  # Search order: the model's script directory inside the conda environment
  # first, then the automated deployment directory.
  for candidate in \
    "$MINICONDA_PATH/$CONDA_ENV/script/$MODEL" \
    "$AUTOMATED_DEPLOYMENT_PATH"; do
    [ -f "$candidate/$wanted" ] || continue
    if ! cd "$candidate"; then
      echo "无法进入目录 $candidate"
      return 1
    fi
    return 0
  done
  # The script exists in neither location.
  return 1
}
# Report whether the media download path contains the model weight archive
# and the conda environment tarball.
# Globals read:
#   MEDIA_DOWNLOAD_PATH, MODEL_TAR, CONDA_ENV, CYAN, GREEN, RED, RESET
# Outputs:
#   one status line per media file on stdout
# Returns:
#   0 if both files exist, 1 if at least one is missing.
# Files are tested by full path instead of after a `cd`, so an inaccessible
# media directory cannot make the test look at the current directory.
check_media_list() {
  local missing=0
  local media_file
  echo -e "${CYAN}Checking media files...${RESET}"
  for media_file in "$MODEL_TAR" "$CONDA_ENV.tar.gz"; do
    if [ -f "$MEDIA_DOWNLOAD_PATH/$media_file" ]; then
      echo -e "${GREEN}The media file $MEDIA_DOWNLOAD_PATH/$media_file exists.${RESET}"
    else
      echo -e "${RED}The media file $MEDIA_DOWNLOAD_PATH/$media_file does not exist.${RESET}"
      missing=1
    fi
  done
  return "$missing"
}
# Check whether the model weight directory exists on the server node.
# Globals read:
#   MODEL_PATH, MODEL_NAME, RED, GREEN, RESET
# Side effects:
#   changes the working directory to MODEL_PATH when it is accessible
# Returns:
#   0 if $MODEL_PATH/$MODEL_NAME is a directory; 1 otherwise, including when
#   MODEL_PATH cannot be entered.
check_model_path() {
  # Without this guard a failed cd would test for MODEL_NAME in whatever
  # directory the script happens to be in and could report a false positive.
  if ! cd "$MODEL_PATH"; then
    echo -e "${RED}The model path $MODEL_PATH is not accessible.${RESET}"
    return 1
  fi
  if [ -d "$MODEL_NAME" ]; then
    echo -e "${GREEN}The model $MODEL_NAME exists.${RESET}"
    return 0
  fi
  echo -e "${RED}The model $MODEL_NAME does not exist.${RESET}"
  return 1
}
# Deploy the model weights by extracting the model archive into MODEL_PATH.
# Globals read:
#   MODEL_PATH, MEDIA_DOWNLOAD_PATH, MODEL_TAR, MODEL_NAME,
#   CYAN, GREEN, RED, RESET
# Side effects:
#   changes the working directory to MODEL_PATH; exits the script with
#   status 1 if MODEL_PATH cannot be entered or the extraction fails.
deploy_model() {
  local archive="$MEDIA_DOWNLOAD_PATH/$MODEL_TAR"
  echo -e "${CYAN}Start deploying model.${RESET}"
  # Never extract into (or clean up in) an unintended directory.
  if ! cd "$MODEL_PATH"; then
    echo -e "${RED}Model deployment failed: cannot enter $MODEL_PATH.${RESET}"
    exit 1
  fi
  # Extract straight from the media path: copying the archive next to the
  # weights first only costs extra disk space and a later cleanup step.
  if tar -zxf "$archive"; then
    echo -e "${GREEN}Model deployed successfully.${RESET}"
  else
    echo -e "${RED}Model deployment failed.${RESET}"
    exit 1
  fi
  # `ll` is an interactive-shell alias and is not available in scripts.
  ls -l "$MODEL_NAME"
  # Ownership change stays best-effort, but a failure is now reported.
  chown -R inspur:inspur "$MODEL_NAME" ||
    echo -e "${RED}Warning: could not change the owner of $MODEL_NAME to inspur:inspur.${RESET}"
}
# Check whether the conda environment directory exists on the server node.
# Globals read:
#   MINICONDA_PATH, CONDA_ENV, RED, GREEN, RESET
# Side effects:
#   changes the working directory to MINICONDA_PATH when it is accessible
# Returns:
#   0 if $MINICONDA_PATH/$CONDA_ENV is a directory; 1 otherwise, including
#   when MINICONDA_PATH cannot be entered.
check_conda_env() {
  # Guard the cd so a same-named directory in the current working directory
  # is never mistaken for the environment.
  if ! cd "$MINICONDA_PATH"; then
    echo -e "${RED}The conda path $MINICONDA_PATH is not accessible.${RESET}"
    return 1
  fi
  if [ -d "$CONDA_ENV" ]; then
    echo -e "${GREEN}The conda environment $CONDA_ENV exists.${RESET}"
    return 0
  fi
  echo -e "${RED}The conda environment $CONDA_ENV does not exist.${RESET}"
  return 1
}
# Create the conda environment by extracting $CONDA_ENV.tar.gz from the media
# download path into $MINICONDA_PATH/$CONDA_ENV, then hand it to inspur:inspur.
# Globals read:
#   MINICONDA_PATH, CONDA_ENV, MEDIA_DOWNLOAD_PATH, GREEN, RED, RESET
# Side effects:
#   exits the script with status 1 when the environment cannot be created.
deploy_conda_env() {
  local env_dir="$MINICONDA_PATH/$CONDA_ENV"
  local archive="$MEDIA_DOWNLOAD_PATH/$CONDA_ENV.tar.gz"
  local created=0

  # Work with full paths instead of cd + relative mkdir, so nothing is
  # created in an unrelated directory when MINICONDA_PATH is wrong.
  if [ -z "$CONDA_ENV" ] || [ ! -d "$MINICONDA_PATH" ]; then
    echo -e "${RED}Failed to create environment $CONDA_ENV.${RESET}"
    exit 1
  fi
  if [ ! -d "$env_dir" ]; then
    if ! mkdir "$env_dir"; then
      echo -e "${RED}Failed to create environment $CONDA_ENV.${RESET}"
      exit 1
    fi
    created=1
  fi

  if tar -zxf "$archive" -C "$env_dir"; then
    echo -e "${GREEN}The environment $CONDA_ENV has been created.${RESET}"
  else
    # Remove the directory created above: a leftover half-extracted directory
    # would make the next run believe the environment already exists.
    if [ "$created" -eq 1 ]; then
      rm -rf -- "${env_dir:?}"
    fi
    echo -e "${RED}Failed to create environment $CONDA_ENV.${RESET}"
    exit 1
  fi

  # Ownership change stays best-effort, but a failure is now reported.
  chown -R inspur:inspur "$env_dir" ||
    echo -e "${RED}Warning: could not change the owner of $env_dir to inspur:inspur.${RESET}"
}
# Run check.sh (located via check_and_enter_script) to find out whether the
# model service process is up.
# Globals read:
#   MODEL, RED, RESET
# Returns:
#   0 if the check.sh output contains "SUCC", 1 if it contains "FAIL".
# Side effects:
#   changes the working directory to the one holding check.sh; exits the
#   script with status 1 if check.sh cannot be located or prints neither
#   marker.
check_model_service_process() {
  local check_result
  if ! check_and_enter_script "check.sh"; then
    echo -e "${RED}check.sh was not found; cannot check the model service.${RESET}" >&2
    exit 1
  fi
  # Quote MODEL so it always reaches check.sh as exactly one argument.
  check_result=$(bash check.sh "$MODEL")
  case "$check_result" in
    *SUCC*) return 0 ;;
    *FAIL*) return 1 ;;
    *)
      echo -e "${RED}Unexpected output from check.sh: $check_result${RESET}" >&2
      exit 1
      ;;
  esac
}
# Stop the model service by running stop.sh (located via
# check_and_enter_script) with the model name as its only argument.
# Globals read:
#   MODEL, MODEL_NAME, GREEN, RED, RESET
# Side effects:
#   changes the working directory to the one holding stop.sh
# Returns:
#   0 if stop.sh succeeded; 1 if it failed or could not be located.
stop_model_service() {
  # Do not try to run a stop.sh from an unrelated working directory.
  if ! check_and_enter_script "stop.sh"; then
    echo -e "${RED}Failed to stop $MODEL_NAME service.${RESET}"
    return 1
  fi
  if bash stop.sh "$MODEL" > /dev/null 2>&1; then
    echo -e "${GREEN}$MODEL_NAME service stopped successfully.${RESET}"
    return 0
  fi
  echo -e "${RED}Failed to stop $MODEL_NAME service.${RESET}"
  return 1
}
# Start the model service by running start.sh (located via
# check_and_enter_script). start.sh receives nine positional arguments, in
# this order: VISIBLE_GPU_INDEX, CONDA_ENV, MODEL, MODEL_NAME_PATH,
# TENSOR_PARALLEL_SIZE, MODEL_SERVER_IP, MODEL_PORT, GPU_MEMORY_UTILIZATION,
# DTYPE.
# Side effects:
#   changes the working directory to the one holding start.sh
# Returns:
#   the exit status of start.sh, or 1 if start.sh cannot be located.
start_model_service() {
  if ! check_and_enter_script "start.sh"; then
    echo -e "${RED}start.sh was not found; cannot start the model service.${RESET}" >&2
    return 1
  fi
  # Every argument is quoted so an empty value or a value containing spaces
  # cannot shift the positions of the arguments that follow it.
  bash "start.sh" \
    "$VISIBLE_GPU_INDEX" "$CONDA_ENV" "$MODEL" "$MODEL_NAME_PATH" \
    "$TENSOR_PARALLEL_SIZE" "$MODEL_SERVER_IP" "$MODEL_PORT" \
    "$GPU_MEMORY_UTILIZATION" "$DTYPE"
}
# Poll check_model_service_process until the model service reports success or
# the timeout elapses.
# Arguments (both optional, defaults match the previous hard-coded values):
#   $1 - timeout in seconds (default 300)
#   $2 - polling interval in seconds (default 5)
# Globals read:
#   MODEL_NAME, MINICONDA_PATH, CONDA_ENV, MODEL, GREEN, RED, RESET
# Returns:
#   0 once the service is up; exits the script with status 1 on timeout.
check_model_started() {
  local timeout="${1:-300}"
  local interval="${2:-5}"
  # Declared separately from the assignments so `local` does not mask the
  # exit status of the command substitutions.
  local start_time current_time elapsed_time
  start_time=$(date +%s)
  while true; do
    sleep "$interval"
    if check_model_service_process; then
      echo -e "${GREEN}$MODEL_NAME service started successfully.${RESET}"
      return 0
    fi
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))
    if [ "$elapsed_time" -ge "$timeout" ]; then
      echo -e "${RED}Failed to start $MODEL_NAME service.${RESET}"
      echo -e "${RED}Please check the log under $MINICONDA_PATH/$CONDA_ENV/script/$MODEL${RESET}"
      exit 1
    fi
  done
}
# Rewrite the `export MODEL_SERVER_IP=`, `export MODEL_PORT=` and
# `export MODEL=` lines of model_curl.sh in place.
# Arguments:
#   $1 - model server IP
#   $2 - model port
#   $3 - model name
# Side effects:
#   changes the working directory to the one holding model_curl.sh
# Returns:
#   0 on success; 1 if model_curl.sh cannot be located or sed fails.
update_model_curl_sh() {
  local script_file="model_curl.sh"
  local model_server_ip=$1
  local model_port=$2
  local model=$3
  # Stop when the script was not found: otherwise sed would edit (or fail on)
  # a same-named file in the current directory and "Updated" would still be
  # printed.
  if ! check_and_enter_script "$script_file"; then
    echo -e "${RED}Cannot update $script_file: script not found.${RESET}" >&2
    return 1
  fi
  # Replace the variable values with a single sed pass.
  sed -i \
    -e "s|^export MODEL_SERVER_IP=.*|export MODEL_SERVER_IP=$model_server_ip|" \
    -e "s|^export MODEL_PORT=.*|export MODEL_PORT=$model_port|" \
    -e "s|^export MODEL=.*|export MODEL=$model|" \
    "$script_file" || return 1
  echo -e "${GREEN}Updated $script_file with the provided values.${RESET}"
}
# Verify the model service: refresh model_curl.sh with the current server IP,
# port and model name, print the script, then run it.
# Globals read:
#   MODEL_SERVER_IP, MODEL_PORT, MODEL, GREEN, RED, RESET
# Returns:
#   the exit status of model_curl.sh, or 1 if the script could not be updated.
model_curl_verification() {
  echo -e "${GREEN}Start to execute the curl command to verify the model${RESET}"
  # Arguments are quoted so an empty or space-containing value cannot shift
  # the positional parameters of update_model_curl_sh.
  if ! update_model_curl_sh "$MODEL_SERVER_IP" "$MODEL_PORT" "$MODEL"; then
    echo -e "${RED}Skipping the model verification: model_curl.sh could not be updated.${RESET}" >&2
    return 1
  fi
  # update_model_curl_sh leaves the working directory at the script location.
  cat model_curl.sh
  echo -e "${GREEN}The curl command returns the following results:${RESET}"
  bash model_curl.sh
}
# Run the yellow block registration script and then the yellow block curl
# check from inside YELLOW_BLOCK_PATH.
# Globals read:
#   YELLOW_BLOCK_PATH, CYAN, GREEN, RED, RESET
# Outputs:
#   progress messages, the content of yellow_block_curl.sh and the output of
#   both scripts on stdout.
yellow_block_registration_curl() {
  echo -e "${CYAN}Start to execute yellow block registration and curl command${RESET}"

  # Nothing to run when the yellow block directory has not been deployed yet.
  [ -d "$YELLOW_BLOCK_PATH" ] || {
    echo -e "${RED}Directory $YELLOW_BLOCK_PATH does not exist. Please deploy the start yellow block first.${RESET}"
    return
  }

  echo -e "${GREEN}Directory $YELLOW_BLOCK_PATH exists.${RESET}"
  cd "$YELLOW_BLOCK_PATH"

  echo -e "${GREEN}Start to execute the yellow block registration${RESET}"
  bash yellow_block_register.sh

  echo -e "${GREEN}Start to execute the curl command to verify the yellow block${RESET}"
  cat yellow_block_curl.sh
  echo -e "${GREEN}The curl command returns the following results:${RESET}"
  bash yellow_block_curl.sh
}
# Rewrite the `export YELLOW_BLOCK_CONDA_ENV=`, `export MODEL_SERVER_IP=`,
# `export MODEL_PORT=` and `export MODEL=` lines of yellow_block_register.sh
# in place.
# Arguments:
#   $1 - yellow block conda environment
#   $2 - model server IP
#   $3 - model port
#   $4 - model name
# Side effects:
#   changes the working directory to the one holding the script
# Returns:
#   0 on success; 1 if the script cannot be located or sed fails.
update_yellow_block_register_sh() {
  local script_file="yellow_block_register.sh"
  local yellow_block_conda_env=$1
  local model_server_ip=$2
  local model_port=$3
  local model=$4
  # Stop when the script was not found, so sed never touches a same-named
  # file in an unrelated working directory.
  if ! check_and_enter_script "$script_file"; then
    echo -e "${RED}Cannot update $script_file: script not found.${RESET}" >&2
    return 1
  fi
  # Replace the variable values with a single sed pass.
  sed -i \
    -e "s|^export YELLOW_BLOCK_CONDA_ENV=.*|export YELLOW_BLOCK_CONDA_ENV=$yellow_block_conda_env|" \
    -e "s|^export MODEL_SERVER_IP=.*|export MODEL_SERVER_IP=$model_server_ip|" \
    -e "s|^export MODEL_PORT=.*|export MODEL_PORT=$model_port|" \
    -e "s|^export MODEL=.*|export MODEL=$model|" \
    "$script_file" || return 1
}
# Rewrite the `export MODEL=` line of yellow_block_curl.sh in place.
# Arguments:
#   $1 - model name
# Side effects:
#   changes the working directory to the one holding the script
# Returns:
#   0 on success; 1 if the script cannot be located or sed fails.
update_yellow_block_curl_sh() {
  local script_file="yellow_block_curl.sh"
  local model=$1
  # Stop when the script was not found, so sed never touches a same-named
  # file in an unrelated working directory.
  if ! check_and_enter_script "$script_file"; then
    echo -e "${RED}Cannot update $script_file: script not found.${RESET}" >&2
    return 1
  fi
  # Replace the variable value.
  sed -i "s|^export MODEL=.*|export MODEL=$model|" "$script_file" || return 1
}
# Refresh the yellow block scripts, copy them to YELLOW_BLOCK_PATH on the
# yellow block server and run the registration plus the curl check there.
# When the model server and the yellow block server share an IP, the copy is
# a local cp and the check runs via yellow_block_registration_curl; otherwise
# the files go over scp and the check runs over ssh as root.
# Globals read:
#   YELLOW_BLOCK_CONDA_ENV, MODEL_SERVER_IP, MODEL_PORT, MODEL,
#   YELLOW_BLOCK_SERVER_IP, YELLOW_BLOCK_SERVER_PORT, YELLOW_BLOCK_PATH,
#   GREEN, RED, RESET
# Returns:
#   1 if the scripts cannot be updated or the scp fails; otherwise the status
#   of the local/remote registration run.
cp_registration_curl() {
  if ! update_yellow_block_register_sh "$YELLOW_BLOCK_CONDA_ENV" "$MODEL_SERVER_IP" "$MODEL_PORT" "$MODEL"; then
    echo -e "${RED}Failed to update yellow_block_register.sh.${RESET}" >&2
    return 1
  fi
  if ! update_yellow_block_curl_sh "$MODEL"; then
    echo -e "${RED}Failed to update yellow_block_curl.sh.${RESET}" >&2
    return 1
  fi
  # Check if the Model IP and Yellow Block IP are the same
  if [ "$MODEL_SERVER_IP" == "$YELLOW_BLOCK_SERVER_IP" ]; then
    echo -e "\n${GREEN}Model IP and Yellow Block IP are the same. Using cp to copy files.${RESET}"
    # A cp failure is only reported: cp also fails when source and target are
    # the same directory, and yellow_block_registration_curl already reports
    # a missing target directory.
    cp -f yellow_block_* "$YELLOW_BLOCK_PATH" ||
      echo -e "${RED}Warning: copying the yellow block scripts to $YELLOW_BLOCK_PATH failed.${RESET}" >&2
    yellow_block_registration_curl
  else
    echo -e "\n${GREEN}Model IP and Yellow Block IP are different. Using scp to copy files.${RESET}"
    if ! scp -P "$YELLOW_BLOCK_SERVER_PORT" yellow_block_* "root@$YELLOW_BLOCK_SERVER_IP":"$YELLOW_BLOCK_PATH"; then
      echo -e "${RED}Failed to copy the yellow block scripts to $YELLOW_BLOCK_SERVER_IP.${RESET}" >&2
      return 1
    fi
    # The remote script body stays a quoted heredoc (nothing in it is expanded
    # locally). The configured YELLOW_BLOCK_PATH is sent ahead of it as a
    # shell-quoted assignment, so the remote side works in the directory the
    # files were just copied to instead of a hard-coded one.
    {
      printf 'YELLOW_BLOCK_PATH=%q\n' "$YELLOW_BLOCK_PATH"
      cat << 'EOF'
# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
CYAN='\033[36m'
RESET='\033[0m'
echo -e "${CYAN}Start to execute yellow block registration and curl command${RESET}"
if [ ! -d "$YELLOW_BLOCK_PATH" ]; then
  echo -e "${RED}Directory $YELLOW_BLOCK_PATH does not exist. Please deploy the start yellow block first.${RESET}"
else
  echo -e "${GREEN}Directory $YELLOW_BLOCK_PATH exists.${RESET}"
  cd "$YELLOW_BLOCK_PATH"
  echo -e "${GREEN}Start to execute the yellow block registration${RESET}"
  bash yellow_block_register.sh
  echo -e "${GREEN}Start to execute the curl command to verify the yellow block${RESET}"
  cat yellow_block_curl.sh
  echo -e "${GREEN}The curl command returns the following results:${RESET}"
  bash yellow_block_curl.sh
fi
EOF
    } | ssh -p "$YELLOW_BLOCK_SERVER_PORT" "root@$YELLOW_BLOCK_SERVER_IP"
  fi
}
# Main function: runs the whole deployment flow in order —
# media check, model/conda deployment when missing, service restart,
# start-up polling, curl verification and yellow block registration.
main() {
  local banner="======================$MODEL_NAME======================"
  echo -e "${CYAN}${banner}${RESET}"

  check_media_list

  # Deploy whatever is not on the node yet.
  check_model_path || deploy_model
  check_conda_env || deploy_conda_env

  # Stop a running instance before starting a fresh one.
  if check_model_service_process; then
    echo -e "${GREEN}The model service process already exists, starting to stop the process.${RESET}"
    stop_model_service
  else
    echo -e "${GREEN}The model service process does not exist. Start to start the model service.${RESET}"
  fi

  sleep 5
  if start_model_service; then
    echo -e "${CYAN}The model service is starting...${RESET}"
  fi

  check_model_started
  model_curl_verification
  cp_registration_curl
  echo -e "\n${CYAN}${banner}${RESET}"
}
# Script entry point: run the deployment flow (main does not read its arguments).
main "$@"