#!/bin/bash
# gateway-watchdog.sh — detect anomalies in the Gateway process log and
# restart the Gateway automatically (v2).
#
# Changes in v2:
#   - Data source switched from session jsonl to the Gateway process log.
#   - Three detection rules: R1 (FailoverError), R2 (stalled recovery=none),
#     R3 (rate_limit/429).
#   - Restart-storm protection: 5 minute cooldown after every restart attempt.
#   - Restart reasons are appended to /tmp/gateway-watchdog-restarts.log for
#     later statistics.
#
# Deployment: run from cron once per minute
#   * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1
#
# Written to stay compatible with the bash 3.2 that ships as /bin/bash on macOS.

set -euo pipefail

# cron provides a minimal PATH; prepend the usual tool locations.
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

# === Configuration ===
readonly LOCK_DIR="/tmp/gateway-watchdog.lock"  # mkdir-based lock (macOS has no flock)
readonly LOCK_STALE_MINUTES=10   # a lock older than this is treated as left over from a killed run
readonly CHECK_WINDOW=120        # how many seconds of recent log to inspect
readonly R1_THRESHOLD=2          # FailoverError threshold
readonly R2_THRESHOLD=3          # stalled recovery=none threshold
readonly R3_THRESHOLD=2          # rate_limit/429 threshold
readonly COOLDOWN=300            # cooldown after a restart attempt (seconds)
# The cutoff timestamp is computed in this fixed UTC offset and compared as a
# string against the first 19 characters of each log entry's "time" field.
# NOTE(review): assumes the Gateway writes "time" as ISO-8601 in UTC+8 — confirm.
readonly LOG_TZ_OFFSET_HOURS=8
# "429" only counts when it is not part of a longer number and not preceded by
# ".", so a fractional-second timestamp such as "10:00:00.429" is not a hit.
readonly R3_PATTERN='rate_limit|(^|[^0-9.])429([^0-9]|$)'

LOG_FILE="/tmp/openclaw/openclaw-$(date '+%Y-%m-%d').log"  # Gateway process log (today's file only)
STATE_FILE="/tmp/gateway-watchdog-state"                   # JSON state file
RESTART_LOG="/tmp/gateway-watchdog-restarts.log"           # restart reasons (append-only)
RESTART_CMD=(openclaw gateway restart)
HEALTH_CMD=(openclaw gateway health)

# Rule hit counters, filled in by count_rules.
R1_COUNT=0
R2_COUNT=0
R3_COUNT=0

# === Functions ===

# Print a timestamped message on stdout (cron redirects it to the watchdog log).
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Take the mkdir-based lock.
# If the lock directory is older than LOCK_STALE_MINUTES it is assumed to be a
# leftover from a run that died without its EXIT trap, and is replaced.
# Returns: 0 when the lock was acquired, 1 when another run holds it.
acquire_lock() {
  if mkdir "$LOCK_DIR" 2>/dev/null; then
    return 0
  fi
  if [[ -n "$(find "$LOCK_DIR" -maxdepth 0 -type d -mmin "+${LOCK_STALE_MINUTES}" 2>/dev/null)" ]]; then
    log "WARN: removing stale lock ${LOCK_DIR} (older than ${LOCK_STALE_MINUTES} min)"
    rmdir "$LOCK_DIR" 2>/dev/null || true
    if mkdir "$LOCK_DIR" 2>/dev/null; then
      return 0
    fi
  fi
  return 1
}

# Read one top-level field from the JSON state file.
# Arguments: $1 - key name
# Outputs:   the value on stdout; an empty line if the file is unreadable or
#            the key is absent; nothing at all if the state file does not exist.
state_get() {
  local key=$1
  [[ -f "$STATE_FILE" ]] || return 0
  # Path and key travel through argv, never through the Python source text.
  python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as f:
        value = json.load(f).get(sys.argv[2], "")
except Exception:
    value = ""
print(value)
' "$STATE_FILE" "$key" 2>/dev/null || true
}

# Strip characters that would break a hand-built JSON string (double quote, backslash).
_json_safe() {
  printf '%s' "$1" | tr -d '"\\'
}

# Write the JSON state file.
# NOTE(review): the original body of this function was truncated in the source
# this was restored from. Only the key "cooldown_until" is confirmed by a
# reader in this script; the other two key names mirror the parameter names —
# confirm against any external consumer of the state file.
# Arguments: $1 - last restart time, $2 - last restart reason,
#            $3 - cooldown end (epoch seconds)
# Returns:   non-zero if the file could not be written
state_write() {
  local last_restart_time=$1
  local last_restart_reason=$2
  local cooldown_until=$3
  local tmp="${STATE_FILE}.tmp.$$"
  # Write to a temp file and rename so a reader never sees a half-written file.
  printf '{"last_restart_time": "%s", "last_restart_reason": "%s", "cooldown_until": %s}\n' \
    "$(_json_safe "$last_restart_time")" \
    "$(_json_safe "$last_restart_reason")" \
    "$cooldown_until" > "$tmp" \
    && mv -f -- "$tmp" "$STATE_FILE"
}

# Append one restart record to RESTART_LOG.
# NOTE(review): the original definition of this function was missing from the
# source this was restored from (only its call sites survived). The
# one-JSON-object-per-line format below is a reconstruction — confirm it
# against whatever analyses the restart log.
# Arguments: $1 - reason, $2 - detail, $3/$4/$5 - R1/R2/R3 hit counts
# Returns:   non-zero if the record could not be appended
log_restart() {
  local reason=$1
  local detail=$2
  local r1=${3:-0}
  local r2=${4:-0}
  local r3=${5:-0}
  printf '{"time": "%s", "reason": "%s", "detail": "%s", "r1": %s, "r2": %s, "r3": %s}\n' \
    "$(date '+%Y-%m-%dT%H:%M:%S%z')" \
    "$(_json_safe "$reason")" \
    "$(_json_safe "$detail")" \
    "$r1" "$r2" "$r3" >> "$RESTART_LOG" 2>/dev/null
}

# Returns 0 while the cooldown recorded in the state file has not yet expired,
# 1 otherwise (including when the recorded value is missing or not a number).
is_in_cooldown() {
  local cooldown_until now
  local digits_re='^[0-9]+$'
  cooldown_until=$(state_get "cooldown_until")
  # Rejects "", "None" and anything else that is not a plain integer.
  if [[ ! "$cooldown_until" =~ $digits_re ]]; then
    return 1
  fi
  now=$(date +%s)
  # "[" compares in decimal; "[[" or "(( ))" would read a leading 0 as octal.
  if [ "$now" -lt "$cooldown_until" ]; then
    return 0
  fi
  return 1
}

# Print the Gateway log lines whose "time" field falls within the last
# CHECK_WINDOW seconds. Lines that are not JSON objects with a string "time"
# are skipped. Prints nothing if the log file is missing or python3 fails.
# Only today's file is read, so entries written just before midnight drop out
# of the window right after the date changes.
get_recent_lines() {
  [[ -f "$LOG_FILE" ]] || return 0
  # The file is handled as bytes so a stray undecodable byte skips one line
  # instead of aborting the whole scan.
  python3 -c '
import json, sys
from datetime import datetime, timedelta, timezone

path = sys.argv[1]
window = int(sys.argv[2])
tz = timezone(timedelta(hours=float(sys.argv[3])))
cutoff = (datetime.now(tz) - timedelta(seconds=window)).strftime("%Y-%m-%dT%H:%M:%S")
out = sys.stdout.buffer
with open(path, "rb") as f:
    for raw in f:
        raw = raw.strip()
        if not raw:
            continue
        try:
            ts = json.loads(raw).get("time", "")
            recent = isinstance(ts, str) and ts[:19] >= cutoff  # compare to the second
        except Exception:
            continue
        if recent:
            out.write(raw + b"\n")
' "$LOG_FILE" "$CHECK_WINDOW" "$LOG_TZ_OFFSET_HOURS" 2>/dev/null || true
}

# Count the stdin lines matching a grep pattern.
# Arguments: $1 - grep (basic regex) pattern
# Outputs:   a bare integer on stdout; 0 when nothing matches or grep fails
_safe_count() {
  local count
  count=$(grep -c -- "$1" 2>/dev/null || true)
  # Some tools pad their counts; keep digits only.
  count=$(printf '%s' "$count" | tr -d '[:space:]')
  printf '%s\n' "${count:-0}"
}

# Compute rule hit counts for a block of log lines.
# Arguments: $1 - newline-separated log lines (may be empty)
# Globals:   writes R1_COUNT, R2_COUNT, R3_COUNT; reads R3_PATTERN
_tally_rules() {
  local lines=$1
  R1_COUNT=0
  R2_COUNT=0
  R3_COUNT=0
  if [[ -z "$lines" ]]; then
    return 0
  fi

  # R1: line contains "lane task error" AND "FailoverError".
  R1_COUNT=$(printf '%s\n' "$lines" \
    | { grep -F -- 'lane task error' || true; } \
    | _safe_count 'FailoverError')

  # R2: line contains "stalled session" AND "recovery=none".
  R2_COUNT=$(printf '%s\n' "$lines" \
    | { grep -F -- 'stalled session' || true; } \
    | _safe_count 'recovery=none')

  # R3: line contains "rate_limit" OR a standalone 429.
  R3_COUNT=$(printf '%s\n' "$lines" \
    | { grep -cE -- "$R3_PATTERN" 2>/dev/null || true; } \
    | tr -d '[:space:]')
  R3_COUNT=${R3_COUNT:-0}
}

# Count rule hits in the recent part of the Gateway log.
# Results are returned in the globals R1_COUNT, R2_COUNT, R3_COUNT.
count_rules() {
  local recent_lines
  # Best effort: a failed read counts as "no recent lines", not a fatal error.
  recent_lines=$(get_recent_lines) || recent_lines=""
  _tally_rules "$recent_lines"
}

# Run the restart command and record the attempt.
# The cooldown state and the restart record are written even when the restart
# command fails, so a broken restart cannot turn into a once-a-minute retry.
# Arguments: $1 - reason, $2 - detail, $3/$4/$5 - R1/R2/R3 hit counts
# Returns:   the exit status of the restart command
restart_gateway() {
  local reason=$1
  local detail=$2
  local r1=$3
  local r2=$4
  local r3=$5
  local rc=0
  local restart_ts cooldown_ts

  "${RESTART_CMD[@]}" 2>&1 || rc=$?

  restart_ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
  cooldown_ts=$(($(date +%s) + COOLDOWN))
  state_write "$restart_ts" "$reason" "$cooldown_ts" \
    || log "WARN: could not write state file ${STATE_FILE}"
  log_restart "$reason" "$detail" "$r1" "$r2" "$r3" \
    || log "WARN: could not append to ${RESTART_LOG}"
  return "$rc"
}

# === Main logic ===
main() {
  # Another run holds the lock: leave quietly.
  acquire_lock || return 0
  trap 'rmdir "$LOCK_DIR" 2>/dev/null' EXIT

  log "watchdog v2 check start (log: $LOG_FILE)"

  # 1. Health check first. It must not depend on today's log file existing:
  #    a Gateway that is down cannot have created it.
  if ! "${HEALTH_CMD[@]}" >/dev/null 2>&1; then
    if is_in_cooldown; then
      log "WARN: Gateway health check failed but in cooldown until $(state_get "cooldown_until"), skipping restart"
      return 0
    fi
    log "WARN: Gateway health check failed, attempting restart"
    restart_gateway "health_fail" "Gateway health check failed" 0 0 0 || true
    log "Gateway restart attempted (reason: health_fail)"
    return 0
  fi

  # 2. Without a log file there is nothing to analyse.
  if [[ ! -f "$LOG_FILE" ]]; then
    log "INFO: Gateway log file not found ($LOG_FILE), skipping detection"
    return 0
  fi

  # 3. Count rule hits.
  count_rules
  log "rule counts: R1(FailoverError)=${R1_COUNT}/${R1_THRESHOLD} R2(stalled)=${R2_COUNT}/${R2_THRESHOLD} R3(rate_limit)=${R3_COUNT}/${R3_THRESHOLD}"

  # 4. Decide whether a rule triggers a restart (first match wins: R1, R2, R3).
  local triggered_reason=""
  local triggered_detail=""
  if [ "$R1_COUNT" -ge "$R1_THRESHOLD" ]; then
    triggered_reason="R1"
    triggered_detail="FailoverError x${R1_COUNT}"
  elif [ "$R2_COUNT" -ge "$R2_THRESHOLD" ]; then
    triggered_reason="R2"
    triggered_detail="stalled recovery=none x${R2_COUNT}"
  elif [ "$R3_COUNT" -ge "$R3_THRESHOLD" ]; then
    triggered_reason="R3"
    triggered_detail="rate_limit/429 x${R3_COUNT}"
  fi

  if [[ -z "$triggered_reason" ]]; then
    # No rule hit; everything looks fine.
    log "all clear"
    return 0
  fi

  if is_in_cooldown; then
    log "ALERT: ${triggered_reason} triggered (${triggered_detail}) but in cooldown until $(state_get "cooldown_until"), skipping restart"
    return 0
  fi

  log "ALERT: ${triggered_reason} triggered (${triggered_detail}), restarting Gateway"
  if restart_gateway "$triggered_reason" "$triggered_detail" "$R1_COUNT" "$R2_COUNT" "$R3_COUNT"; then
    log "Gateway restart completed (reason: ${triggered_reason})"
  else
    log "ERROR: Gateway restart command failed (reason: ${triggered_reason})"
  fi
  return 0
}

main "$@"