#!/usr/bin/env bash
# hf-entrypoint.sh - container entrypoint for HF Spaces.
#
# This part of the script:
#   0.   persists and loads environment variables
#   1.   starts supervisord (manages cron + openclaw-gateway)
#   1.0  configures and starts sshd plus its watchdog
#
# Environment:
#   ROOT_PASSWORD            optional; when non-empty it becomes root's password
#   SSH_WATCHDOG_DOCKER_LOG  "true" sends watchdog output to PID 1's
#                            stdout/stderr; default is "false"
set -euo pipefail

# Print a UTC-timestamped message carrying the entrypoint prefix.
hf_log() {
  printf '[%s] hf-entrypoint: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

hf_log "starting..."

# ============================================
# 0. Save environment variables to /etc/profile.d for later bash sessions
# ============================================
if [[ -x /usr/local/bin/save-env.sh ]]; then
  /usr/local/bin/save-env.sh
else
  hf_log "warning: save-env.sh not found, skipping env export"
fi

# Load the saved environment variables.
if [[ -f /etc/profile.d/openclaw-env.sh ]]; then
  # shellcheck source=/dev/null
  source /etc/profile.d/openclaw-env.sh
  hf_log "loaded environment from /etc/profile.d/openclaw-env.sh"
fi

# ============================================
# 1. Start supervisord (manages cron + openclaw-gateway)
# ============================================
hf_log "starting supervisord..."
mkdir -p /var/run /var/log/supervisor /var/log/hf-entrypoint

/usr/bin/supervisord -c /etc/supervisor/supervisord.conf \
  >> /var/log/hf-entrypoint/supervisord-stdout.log \
  2>> /var/log/hf-entrypoint/supervisord-stderr.log &
SUPERVISORD_PID=$!
hf_log "supervisord started (pid=$SUPERVISORD_PID)"

# Wait for the pid file, but never forever: if supervisord fails to come up
# the rest of the container (sshd, node server) should still start.
SUPERVISORD_READY_TIMEOUT=60 # seconds
_ticks=0                     # number of 0.5s polls performed so far
while [[ ! -f /var/run/supervisord.pid ]]; do
  if ((_ticks >= SUPERVISORD_READY_TIMEOUT * 2)); then
    break
  fi
  sleep 0.5
  _ticks=$((_ticks + 1))
done

if [[ -f /var/run/supervisord.pid ]]; then
  hf_log "supervisord is ready"
else
  hf_log "WARNING: supervisord pid file not found after ${SUPERVISORD_READY_TIMEOUT}s, continuing anyway"
fi

# 1.0 Start the SSH service and its watchdog (keeps SSH available)
hf_log "starting SSH service and watchdog..."

# Set the root password when ROOT_PASSWORD is provided.
if [[ -n "${ROOT_PASSWORD:-}" ]]; then
  hf_log "setting root password..."
  # The pipeline is tested directly: under `set -e -o pipefail` a separate
  # `$?` check would never be reached when chpasswd fails.
  if printf 'root:%s\n' "$ROOT_PASSWORD" | chpasswd 2>/dev/null; then
    # Make sure the root account is not locked.
    passwd -u root 2>/dev/null || true
    hf_log "root password set successfully"
  else
    hf_log "WARNING: failed to set root password"
  fi
fi

# Create the sshd privilege-separation directories and remove stale
# PID files / sockets (best effort).
hf_log "preparing SSH runtime directories..."
mkdir -p /run/sshd /var/run/sshd 2>/dev/null || true
chmod 755 /run/sshd /var/run/sshd 2>/dev/null || true
rm -f /var/run/sshd.pid /var/run/sshd.init.pid /tmp/ssh-* 2>/dev/null || true

# Generate SSH host keys if they do not exist yet.
if [[ ! -f /etc/ssh/ssh_host_rsa_key ]]; then
  hf_log "generating SSH host keys..."
  ssh-keygen -A 2>/dev/null || true
fi

# Make sure sshd is running.
if ! pgrep -x sshd > /dev/null 2>&1; then
  _sshd_bin=""
  if [[ -x /usr/sbin/sshd ]]; then
    _sshd_bin="/usr/sbin/sshd"
  elif [[ -x /usr/bin/sshd ]]; then
    _sshd_bin="/usr/bin/sshd"
  fi

  if [[ -n "$_sshd_bin" ]]; then
    hf_log "starting sshd from $_sshd_bin..."
    # A failed launch must not abort the entrypoint; the pgrep check below
    # reports it and the watchdog is expected to retry.
    "$_sshd_bin" || true
    sleep 2

    if pgrep -x sshd > /dev/null 2>&1; then
      hf_log "sshd started successfully"

      # Make sshd a less likely OOM-killer victim. The write can fail in
      # restricted containers, so only report success when it took effect.
      _oom_applied=0
      for _pid in $(pgrep -x sshd 2>/dev/null); do
        if { echo -500 > "/proc/${_pid}/oom_score_adj"; } 2>/dev/null; then
          _oom_applied=1
        fi
      done
      if ((_oom_applied)); then
        hf_log "sshd OOM protection applied (oom_score_adj=-500)"
      else
        hf_log "WARNING: could not apply sshd OOM protection (oom_score_adj=-500)"
      fi

      # Tune the kernel memory policy to reduce the chance of the OOM killer
      # hitting critical services.
      if { echo "2" > /proc/sys/vm/overcommit_memory; } 2>/dev/null; then
        hf_log "vm.overcommit_memory=2 set (never overcommit)"
      else
        hf_log "WARNING: could not set vm.overcommit_memory=2"
      fi
    else
      hf_log "WARNING: sshd failed to start, will be handled by watchdog"
    fi
  else
    hf_log "WARNING: sshd executable not found"
  fi
fi

# Start the SSH watchdog (keeps the SSH service available).
# SSH_WATCHDOG_DOCKER_LOG controls whether the watchdog's output is sent to
# the container log (default: false).
SSH_WATCHDOG_DOCKER_LOG="${SSH_WATCHDOG_DOCKER_LOG:-false}"
if [[ -x /usr/local/bin/ssh_service_watchdog.sh ]]; then
  hf_log "starting SSH watchdog (docker_log=$SSH_WATCHDOG_DOCKER_LOG)..."
  if [[ "$SSH_WATCHDOG_DOCKER_LOG" == "true" ]]; then
    # Send watchdog output to PID 1's stdout/stderr so it stays visible in
    # the container log even when a terminal cannot connect.
    nohup /usr/local/bin/ssh_service_watchdog.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 &
    SSH_WATCHDOG_PID=$!
    hf_log "SSH watchdog logs are available via: docker logs "
  else
    # Discard stdout/stderr. NOTE(review): the watchdog presumably keeps its
    # own log at /var/log/ssh_watchdog.log - confirm in the watchdog script.
    nohup /usr/local/bin/ssh_service_watchdog.sh > /dev/null 2>&1 &
    SSH_WATCHDOG_PID=$!
  fi
  hf_log "SSH watchdog started (pid=$SSH_WATCHDOG_PID)"
else
  hf_log "WARNING: ssh_service_watchdog.sh not found"
fi

# 1.1 Start BT Panel (runs while openclaw-gateway restores, to save time)
# ============================================
hf_log "starting BT Panel..."
# Timestamped logger; defined here only if an earlier section has not
# already provided it, so this section is self-contained.
declare -F hf_log > /dev/null || hf_log() {
  printf '[%s] hf-entrypoint: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

if [[ -f /www/server/panel/default.pl ]]; then
  hf_log "BT Panel is installed, starting..."
  bt start 2>/dev/null || true
  bt default 2>/dev/null || true
else
  hf_log "BT Panel not installed, skipping"
fi

# 1.2 Wait for openclaw-gateway to finish its restore
# ============================================
hf_log "waiting for openclaw-gateway to complete restore..."

RESTORE_COMPLETED_FILE="/tmp/openclaw-restore-completed"
# shellcheck disable=SC2034 # kept for reference; not read in this section
OPENCLAW_LOG_FILE="/var/log/hf-entrypoint/openclaw-gateway-stdout.log"
RESTORE_LOG_FILE="/var/log/openclaw/restore.log"
POLL_INTERVAL=2            # seconds between completion checks
WAITED=0                   # seconds waited so far
LAST_RESTORE_LINE=0        # restore-log lines already echoed
LAST_LOG_SIZE=0            # restore-log size (bytes) at the last check
PROGRESS_CHECK_INTERVAL=20 # seconds between aggregated status lines
SLOW_WARN_THRESHOLD=900    # start "taking long" warnings after 15 minutes
MAX_WAIT_TIMEOUT=3600      # give up waiting after 1 hour and continue startup
IDLE_WARN_THRESHOLD=120    # warn when the log has not grown for this long
TIME_NO_NEW_LOG=0          # seconds since the restore log last grew

mkdir -p "$(dirname "$RESTORE_LOG_FILE")"

#######################################
# Render a byte count as B / KB / MB with one decimal place.
# Arguments: $1 - size in bytes
# Outputs:   formatted size on stdout (no trailing newline)
#######################################
format_size() {
  local bytes=$1
  if ((bytes >= 1048576)); then
    printf '%d.%dMB' "$((bytes / 1048576))" "$(((bytes % 1048576) * 10 / 1048576))"
  elif ((bytes >= 1024)); then
    printf '%d.%dKB' "$((bytes / 1024))" "$(((bytes % 1024) * 10 / 1024))"
  else
    printf '%dB' "$bytes"
  fi
}

#######################################
# Echo restore-log lines that appeared since the previous call.
# Globals:   RESTORE_LOG_FILE (read); LAST_RESTORE_LINE, LAST_LOG_SIZE,
#            TIME_NO_NEW_LOG (written)
# Outputs:   new log lines, each prefixed with a UTC timestamp
#######################################
show_restore_progress() {
  [[ -f "$RESTORE_LOG_FILE" ]] || return 0

  local current_lines
  current_lines=$(wc -l < "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")

  if [[ -n "$current_lines" ]] && [[ "$current_lines" -gt "$LAST_RESTORE_LINE" ]]; then
    local new_lines=$((current_lines - LAST_RESTORE_LINE))
    hf_log "--- Restore log ($new_lines new lines) ---"
    # Print exactly the unseen line range. `tail -n` would skip or repeat
    # lines when the file grows between the count and the read, or when the
    # last line is still being written.
    sed -n "$((LAST_RESTORE_LINE + 1)),${current_lines}p" "$RESTORE_LOG_FILE" \
      | while IFS= read -r line; do
        printf '[%s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$line"
      done
    LAST_RESTORE_LINE="$current_lines"
    TIME_NO_NEW_LOG=0
  fi

  # Remember the file size (bytes) as an activity indicator.
  local current_size
  current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
  if [[ "$current_size" != "$LAST_LOG_SIZE" ]]; then
    LAST_LOG_SIZE="$current_size"
  fi
}

while true; do
  # Show diagnostics on the first pass only.
  if ((WAITED == 0)); then
    hf_log "waiting for restore completion..."
    hf_log "RESTORE_LOG_FILE=$RESTORE_LOG_FILE"
    hf_log "RESTORE_COMPLETED_FILE=$RESTORE_COMPLETED_FILE"
  fi

  # Every poll: surface any new restore-log output.
  show_restore_progress

  # The marker file is the only completion signal. It is checked before the
  # timeout so a restore finishing right at the deadline is not reported as
  # timed out.
  if [[ -f "$RESTORE_COMPLETED_FILE" ]]; then
    hf_log "✓ Restore completed"
    if [[ -f "$RESTORE_LOG_FILE" ]]; then
      hf_log "--- Final restore log (last 10 lines) ---"
      tail -n 10 "$RESTORE_LOG_FILE" | while IFS= read -r line; do
        printf '[%s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$line"
      done
    fi
    break
  fi

  # Safety net: stop waiting after MAX_WAIT_TIMEOUT and continue startup.
  if ((WAITED >= MAX_WAIT_TIMEOUT)); then
    hf_log "⚠ WARNING: Restore timed out after ${WAITED}s ($((WAITED / 60))min), forcing proceed"
    hf_log "⚠ The restore process may still be running in the background"
    break
  fi

  sleep "$POLL_INTERVAL"
  WAITED=$((WAITED + POLL_INTERVAL))

  # Track how long the restore log has been idle and warn periodically.
  if [[ -f "$RESTORE_LOG_FILE" ]]; then
    current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
    if [[ "$current_size" -eq "$LAST_LOG_SIZE" ]]; then
      TIME_NO_NEW_LOG=$((TIME_NO_NEW_LOG + POLL_INTERVAL))
      if ((TIME_NO_NEW_LOG % IDLE_WARN_THRESHOLD == 0)); then
        hf_log "⚠ WARNING: restore log has not grown for ${TIME_NO_NEW_LOG}s"
      fi
    else
      TIME_NO_NEW_LOG=0
    fi
  fi

  # Aggregated status line every PROGRESS_CHECK_INTERVAL seconds.
  if ((WAITED % PROGRESS_CHECK_INTERVAL == 0)); then
    elapsed_min=$((WAITED / 60))
    log_size_str=""
    if [[ -f "$RESTORE_LOG_FILE" ]]; then
      file_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
      log_size_str=$(format_size "$file_size")
    fi
    hf_log "still waiting for restore... (${WAITED}s / ${elapsed_min}min, log: ${log_size_str:-N/A})"

    # Past the slow threshold, add a warning once per minute.
    if ((WAITED >= SLOW_WARN_THRESHOLD && WAITED % 60 == 0)); then
      hf_log "⚠ WARNING: Restore is taking longer than expected (>${elapsed_min}min). Large backup (>10GB) may require more time."
    fi
  fi
done

hf_log "✓ Restore wait completed (${WAITED}s / $((WAITED / 60))min), proceeding with PM2 startup"

# 1.3 Make sure the cron daemon is running
if ! pgrep -x cron > /dev/null 2>&1; then
  hf_log "starting cron daemon..."
  /usr/sbin/cron
fi

# ============================================
# 2. Start PM2 for additional node processes (if any are configured)
# ============================================
hf_log "starting PM2 for others..."
mkdir -p /root/.pm2 /var/log/hf-entrypoint

# Only launch pm2-runtime when the ecosystem file declares at least one app.
if grep -qE '"name"\s*:' /app/pm2/ecosystem.config.js 2>/dev/null; then
  /usr/bin/pm2-runtime /app/pm2/ecosystem.config.js \
    >> /var/log/hf-entrypoint/pm2-stdout.log \
    2>> /var/log/hf-entrypoint/pm2-stderr.log &
  PM2_PID=$!
  hf_log "PM2 started (pid=$PM2_PID)"
else
  hf_log "PM2: no applications defined in ecosystem.config.js, skipping..."
  PM2_PID=""
fi

# ============================================
# 3. Signal forwarding (TERM/INT/QUIT sent to PID 1 reach every service)
# ============================================
NODE_PID=""

#######################################
# Forward a termination request to every process this script started.
# Globals: NODE_PID, SSH_WATCHDOG_PID, SUPERVISORD_PID, PM2_PID (read)
#######################################
signal_handler() {
  hf_log "received SIGTERM, forwarding to all processes..."

  # Stop the node server.
  if [[ -n "${NODE_PID:-}" ]]; then
    kill -TERM "$NODE_PID" 2>/dev/null || true
  fi

  # Stop the SSH watchdog.
  if [[ -n "${SSH_WATCHDOG_PID:-}" ]]; then
    hf_log "stopping SSH watchdog (pid=$SSH_WATCHDOG_PID)..."
    kill -TERM "$SSH_WATCHDOG_PID" 2>/dev/null || true
  fi

  # Stop supervisord.
  if [[ -n "${SUPERVISORD_PID:-}" ]]; then
    kill -TERM "$SUPERVISORD_PID" 2>/dev/null || true
  fi

  # Stop PM2 (PM2_PID is empty when no apps were configured).
  if [[ -n "${PM2_PID:-}" ]]; then
    kill -TERM "$PM2_PID" 2>/dev/null || true
  fi

  # Stop the SSH service.
  hf_log "stopping SSH service..."
  if pgrep -x sshd > /dev/null 2>&1; then
    killall sshd 2>/dev/null || true
  fi

  hf_log "all services stopped"
}

trap signal_handler TERM INT QUIT

# ============================================
# 4. Run node hf-server.js and wait for it
# ============================================
# node runs as a child rather than via `exec`: exec would replace this shell
# and discard the trap above, so signals would never be forwarded.
hf_log "starting node server.js..."
cd /app
node hf-server.js &
NODE_PID=$!

node_status=0
wait "$NODE_PID" || node_status=$?
# A trapped signal interrupts `wait` before node has exited; keep waiting
# until the process is really gone so its final exit status is reported.
while kill -0 "$NODE_PID" 2>/dev/null; do
  wait "$NODE_PID" || node_status=$?
done

exit "$node_status"