#!/bin/bash
# gateway-watchdog.sh — restart the Gateway when its process log shows trouble (v2)
#
# Changes in v2:
#   - The data source is the Gateway process log instead of session jsonl files.
#   - Three detection rules: R1 (FailoverError), R2 (stalled session with
#     recovery=none), R3 (rate_limit / "429").
#   - Restart-storm protection: rule-triggered restarts are skipped while a
#     5 minute cooldown from the previous restart is active.
#   - Every restart reason is appended to /tmp/gateway-watchdog-restarts.log
#     for later analysis.
#
# Deployment: run from cron once a minute
#   * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1

set -euo pipefail

# cron provides a minimal PATH; add the usual tool locations.
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

# === Configuration ===
# macOS has no flock(1), so an atomically created directory serves as the lock.
# NOTE(review): a SIGKILLed run leaves the directory behind and blocks all later
# runs until it is removed by hand.
readonly LOCK_FILE="/tmp/gateway-watchdog.lock"
readonly CHECK_WINDOW=120   # how many seconds of log history to inspect
readonly R1_THRESHOLD=2     # FailoverError hits that trigger a restart
readonly R2_THRESHOLD=3     # stalled recovery=none hits that trigger a restart
readonly R3_THRESHOLD=2     # rate_limit/429 hits that trigger a restart
readonly COOLDOWN=300       # seconds after a restart during which rules are ignored
LOG_FILE="/tmp/openclaw/openclaw-$(date '+%Y-%m-%d').log"   # Gateway process log
readonly LOG_FILE
readonly STATE_FILE="/tmp/gateway-watchdog-state"           # JSON state file
readonly RESTART_LOG="/tmp/gateway-watchdog-restarts.log"   # append-only restart history
# Commands are arrays so they are executed without relying on word splitting.
readonly -a RESTART_CMD=(openclaw gateway restart)
readonly -a HEALTH_CMD=(openclaw gateway health)

# Rule hit counters, filled in by count_rules.
R1_COUNT=0
R2_COUNT=0
R3_COUNT=0

#######################################
# Print a timestamped message to stdout (cron appends it to the watchdog log).
# Arguments: message words
#######################################
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}

#######################################
# Print the value of one top-level key of the JSON state file.
# Prints an empty line when the key is absent or the file is unreadable or
# corrupt; prints nothing when the state file does not exist.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key name
#######################################
state_get() {
  local key=$1
  [[ -f "$STATE_FILE" ]] || return 0
  # Path and key travel as argv so they are never parsed as Python source.
  python3 - "$STATE_FILE" "$key" <<'PY' 2>/dev/null || true
import json
import sys

try:
    with open(sys.argv[1], encoding='utf-8') as f:
        state = json.load(f)
    print(state.get(sys.argv[2], ''))
except Exception:
    print('')
PY
}

#######################################
# Overwrite the JSON state file.
# NOTE(review): the key names are inferred from the parameter names; only
# "cooldown_until" is confirmed by a reader (is_in_cooldown). Confirm the
# layout if anything else consumes this file.
# Globals:   STATE_FILE (written)
# Arguments: $1 - restart timestamp, $2 - reason code, $3 - cooldown end (epoch seconds)
#######################################
state_write() {
  local last_restart_time=$1
  local last_restart_reason=$2
  local cooldown_until=$3
  # All three values are generated by this script (a date string, a fixed
  # reason code, an integer), so no JSON escaping is required.
  printf '{"last_restart_time": "%s", "last_restart_reason": "%s", "cooldown_until": %s}\n' \
    "$last_restart_time" "$last_restart_reason" "$cooldown_until" > "$STATE_FILE"
}

#######################################
# Append one JSON line describing a restart to the permanent restart log.
# Globals:   RESTART_LOG (appended)
# Arguments: $1 - reason code, $2 - human readable detail,
#            $3 $4 $5 - R1/R2/R3 hit counts at the time of the restart
# Returns:   0 always; a failed write is reported on stderr.
#######################################
log_restart() {
  local reason=$1
  local detail=$2
  local r1_count=$3
  local r2_count=$4
  local r3_count=$5
  local ts
  ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
  if ! python3 - "$RESTART_LOG" "$ts" "$reason" "$detail" \
    "$r1_count" "$r2_count" "$r3_count" <<'PY'
import json
import sys

path, ts, reason, detail, r1, r2, r3 = sys.argv[1:8]
entry = {
    'time': ts,
    'reason': reason,
    'detail': detail,
    'counts': {'r1': int(r1), 'r2': int(r2), 'r3': int(r3)},
}
with open(path, 'a', encoding='utf-8') as f:
    f.write(json.dumps(entry, ensure_ascii=False) + '\n')
PY
  then
    log "WARN: could not append restart record to $RESTART_LOG" >&2
  fi
}

#######################################
# Tell whether the cooldown stored in the state file is still running.
# Globals: STATE_FILE (read via state_get)
# Returns: 0 while in cooldown; 1 otherwise, including when the stored value
#          is missing or not a plain integer.
#######################################
is_in_cooldown() {
  local cooldown_until now
  cooldown_until=$(state_get "cooldown_until")
  # Also rejects "" and "None" before the arithmetic comparison.
  [[ "$cooldown_until" =~ ^[0-9]+$ ]] || return 1
  now=$(date +%s)
  (( now < cooldown_until ))
}

#######################################
# Print the Gateway log lines whose JSON "time" field lies within the last
# CHECK_WINDOW seconds. Lines that are not JSON objects with a usable "time"
# are skipped.
# Timestamps with an explicit UTC offset (or trailing "Z") are compared as
# absolute instants; timestamps without one are taken to be UTC+8.
# Globals: LOG_FILE, CHECK_WINDOW (read)
# Outputs: matching log lines on stdout; a warning on stderr if the scan fails
#######################################
get_recent_lines() {
  [[ -f "$LOG_FILE" ]] || return 0
  if ! PYTHONIOENCODING=utf-8 python3 - "$LOG_FILE" "$CHECK_WINDOW" <<'PY'
import json
import sys
from datetime import datetime, timedelta, timezone

path = sys.argv[1]
window = int(sys.argv[2])
assumed_tz = timezone(timedelta(hours=8))
cutoff = (datetime.now(assumed_tz) - timedelta(seconds=window)).replace(microsecond=0)
cutoff_prefix = cutoff.strftime('%Y-%m-%dT%H:%M:%S')


def is_recent(raw):
    try:
        text = raw[:-1] + '+00:00' if raw.endswith('Z') else raw
        stamp = datetime.fromisoformat(text)
    except ValueError:
        # Not parseable as ISO-8601: compare the second-resolution prefix.
        return raw[:19] >= cutoff_prefix
    if stamp.tzinfo is None:
        stamp = stamp.replace(tzinfo=assumed_tz)
    return stamp.replace(microsecond=0) >= cutoff


# errors='replace' keeps one undecodable byte from aborting the whole scan.
with open(path, encoding='utf-8', errors='replace') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
            if is_recent(record.get('time', '')):
                sys.stdout.write(line + '\n')
        except Exception:
            pass
PY
  then
    # stderr, so the warning is not mistaken for a log line by the caller.
    log "WARN: failed to scan $LOG_FILE" >&2
  fi
}

#######################################
# Reduce grep -c output to a bare non-negative integer.
# Arguments: $1 - raw count text
# Outputs:   the digits of $1, or 0 when there are none
#######################################
_as_count() {
  local digits=${1//[!0-9]/}
  printf '%s\n' "${digits:-0}"
}

#######################################
# Count how many recent log lines hit each rule.
#   R1: line contains "lane task error" and "FailoverError"
#   R2: line contains "stalled session" and "recovery=none"
#   R3: line contains rate_limit or the quoted string "429"
# Globals: R1_COUNT, R2_COUNT, R3_COUNT (written)
#######################################
count_rules() {
  local recent_lines
  recent_lines=$(get_recent_lines)

  R1_COUNT=0
  R2_COUNT=0
  R3_COUNT=0
  [[ -n "$recent_lines" ]] || return 0

  # grep -c prints 0 and exits 1 when nothing matches. "|| true" absorbs that
  # status without printing a second number.
  R1_COUNT=$(_as_count "$(grep -F -- 'lane task error' <<<"$recent_lines" \
    | grep -c -F -- 'FailoverError' || true)")
  R2_COUNT=$(_as_count "$(grep -F -- 'stalled session' <<<"$recent_lines" \
    | grep -c -F -- 'recovery=none' || true)")
  R3_COUNT=$(_as_count "$(grep -c -F -e 'rate_limit' -e '"429"' \
    <<<"$recent_lines" || true)")
}

#######################################
# Persist the cooldown and append the restart record after a restart.
# Arguments: $1 - reason code, $2 - detail, $3 $4 $5 - R1/R2/R3 counts
#######################################
_record_restart() {
  local reason=$1 detail=$2 r1=$3 r2=$4 r3=$5
  local restart_ts cooldown_ts
  restart_ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
  cooldown_ts=$(( $(date +%s) + COOLDOWN ))
  state_write "$restart_ts" "$reason" "$cooldown_ts"
  log_restart "$reason" "$detail" "$r1" "$r2" "$r3"
}

#######################################
# One watchdog pass: take the lock, check health, evaluate the rules and
# restart the Gateway when required.
# Returns: 0 normally; the restart command's status when a rule-triggered
#          restart fails.
#######################################
main() {
  # Single-instance guard: a concurrent run simply does nothing.
  if ! mkdir "$LOCK_FILE" 2>/dev/null; then
    return 0
  fi
  trap 'rmdir "$LOCK_FILE" 2>/dev/null' EXIT

  log "watchdog v2 check start (log: $LOG_FILE)"

  # 1. Without today's log file there is nothing to analyse.
  # NOTE(review): this runs before the health check, so a Gateway that is down
  # and has not created today's log is never restarted. Confirm this is intended.
  if [[ ! -f "$LOG_FILE" ]]; then
    log "INFO: Gateway log file not found ($LOG_FILE), skipping detection"
    return 0
  fi

  # 2. A failed health check restarts immediately, regardless of cooldown.
  if ! "${HEALTH_CMD[@]}" &>/dev/null; then
    log "WARN: Gateway health check failed, attempting restart"
    "${RESTART_CMD[@]}" 2>&1 || true
    _record_restart "health_fail" "Gateway health check failed" 0 0 0
    log "Gateway restart completed (reason: health_fail)"
    return 0
  fi

  # 3. Count rule hits in the recent log window.
  count_rules
  log "rule counts: R1(FailoverError)=${R1_COUNT}/${R1_THRESHOLD} R2(stalled)=${R2_COUNT}/${R2_THRESHOLD} R3(rate_limit)=${R3_COUNT}/${R3_THRESHOLD}"

  # 4. The first rule at or above its threshold decides the restart reason.
  local triggered_reason="" triggered_detail=""
  if (( R1_COUNT >= R1_THRESHOLD )); then
    triggered_reason="R1"
    triggered_detail="FailoverError x${R1_COUNT}"
  elif (( R2_COUNT >= R2_THRESHOLD )); then
    triggered_reason="R2"
    triggered_detail="stalled recovery=none x${R2_COUNT}"
  elif (( R3_COUNT >= R3_THRESHOLD )); then
    triggered_reason="R3"
    triggered_detail="rate_limit/429 x${R3_COUNT}"
  fi

  if [[ -z "$triggered_reason" ]]; then
    # No rule fired.
    log "all clear"
    return 0
  fi

  if is_in_cooldown; then
    local cooldown_until
    cooldown_until=$(state_get "cooldown_until")
    log "ALERT: ${triggered_reason} triggered (${triggered_detail}) but in cooldown until ${cooldown_until}, skipping restart"
    return 0
  fi

  log "ALERT: ${triggered_reason} triggered (${triggered_detail}), restarting Gateway"
  local rc=0
  "${RESTART_CMD[@]}" 2>&1 || rc=$?
  if (( rc != 0 )); then
    # No state is written, so the next run is free to try again.
    log "ERROR: Gateway restart command failed with status ${rc} (reason: ${triggered_reason})"
    return "$rc"
  fi
  _record_restart "$triggered_reason" "$triggered_detail" \
    "$R1_COUNT" "$R2_COUNT" "$R3_COUNT"
  log "Gateway restart completed (reason: ${triggered_reason})"
}

main "$@"