auto-sync: 2026-06-02 22:00:32

2026-06-02 22:00:32 +08:00
parent 4d2d13e0c1
commit 96c3f1409b
1 changed files with 182 additions and 79 deletions
@@ -1,10 +1,11 @@
 #!/bin/bash
-# gateway-watchdog.sh — 检测 OpenClaw Gateway 429 并自动重启
-# 
-# 机制：
-#   1. 检查最近 CHECK_WINDOW 秒内的 session jsonl 日志
-#   2. 统计 429 错误次数（errorCode=1305 或 errorMessage 含 "429"）
-#   3. CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启 Gateway
+# gateway-watchdog.sh — 检测 Gateway 进程日志异常并自动重启（v2）
+#
+# 改动（v2）：
+#   - 数据源从 session jsonl 改为 Gateway 进程日志
+#   - 三条检测规则：R1（FailoverError）、R2（stalled recovery=none）、R3（rate_limit/429）
+#   - 防重启风暴：重启后 5 分钟冷却期
+#   - 重启原因记录到 /tmp/gateway-watchdog-restarts.log 用于统计分析
 #
 # 部署：cron 每分钟执行一次
 #   * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1
@@ -14,20 +15,22 @@ set -euo pipefail
 # crontab 环境下 PATH 不完整，补上
 export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

-# === flock 防并发 ===
+# === Concurrency guard (macOS has no flock, so a mkdir lock directory is used) ===
 LOCK_FILE="/tmp/gateway-watchdog.lock"
-# macOS 无 flock，用 mkdir 做简易锁
 if ! mkdir "$LOCK_FILE" 2>/dev/null; then
    exit 0
 fi
 trap 'rmdir "$LOCK_FILE" 2>/dev/null' EXIT

 # === 配置 ===
-CHECK_WINDOW=120          # 检查最近多少秒的日志
-THRESHOLD=3               # CHECK_WINDOW 内发现多少个 429 就重启
-MTIME_BUFFER=180          # 文件 mtime 阈值（CHECK_WINDOW + buffer，防漏检）
-STATE_FILE="/tmp/gateway-watchdog-429-count"
-LOG_DIR="/Users/chufeng/.openclaw/agents"
+CHECK_WINDOW=120                                              # how many seconds of recent log to inspect
+R1_THRESHOLD=2                                                # FailoverError threshold
+R2_THRESHOLD=3                                                # stalled recovery=none threshold
+R3_THRESHOLD=2                                                # rate_limit/429 threshold
+COOLDOWN=300                                                  # cooldown after a restart (seconds)
+LOG_FILE="/tmp/openclaw/openclaw-$(date '+%Y-%m-%d').log"      # Gateway process log (file name carries the current date)
+STATE_FILE="/tmp/gateway-watchdog-state"                      # JSON state file
+RESTART_LOG="/tmp/gateway-watchdog-restarts.log"              # restart-reason records (append-only)
 RESTART_CMD="openclaw gateway restart"
 HEALTH_CMD="openclaw gateway health"

@@ -37,90 +40,190 @@ log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
 }

-# 统计最近的 429 错误数
-count_recent_429() {
-    local cutoff=$(date -u -v-${CHECK_WINDOW}S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null || date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S')
-    local count=0
+# Print the value of top-level key $1 from the JSON state file; prints an empty line if the file is unreadable or not valid JSON.
+state_get() {
+    local key="$1"
+    if [ -f "$STATE_FILE" ]; then  # no state file yet -> print nothing
+        python3 -c "
+import json, sys
+try:
+    with open(sys.argv[1]) as f:
+        d = json.load(f)
+    print(d.get(sys.argv[2], ''))
+except Exception:
+    print('')
+" "$STATE_FILE" "$key" 2>/dev/null
+    fi
+}

-    for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do
-        # 排除 trajectory 和 checkpoint 文件
-        [[ "$jsonl" == *"trajectory"* ]] && continue
-        [[ "$jsonl" == *"checkpoint"* ]] && continue
-        [ -f "$jsonl" ] || continue
+# Overwrite the state file with a single-line JSON object.
+# Args: $1 restart timestamp, $2 restart reason, $3 cooldown-end value (written unquoted)
+state_write() {
+    local restarted_at="$1"
+    local why="$2"
+    local resume_epoch="$3"
+    printf '{"last_restart_time":"%s","last_restart_reason":"%s","cooldown_until":%s}\n' \
+        "$restarted_at" "$why" "$resume_epoch" > "$STATE_FILE"
+}

-        # mtime 过滤（加 buffer 防漏检）
-        local mtime
-        mtime=$(stat -f '%m' "$jsonl" 2>/dev/null | tr -d ' \n' || stat -c '%Y' "$jsonl" 2>/dev/null | tr -d ' \n')
-        [ -z "$mtime" ] && continue
+# Append one restart record, as a JSON line, to the restart log. Args: $1 reason, $2 detail, $3-$5 R1/R2/R3 hit counts.
+log_restart() {
+    local reason="$1"
+    local detail="$2"
+    local r1_count="$3"
+    local r2_count="$4"
+    local r3_count="$5"
+    local ts
+    ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
+    # Make sure the restart log exists. All values reach Python through argv, never spliced into its source text.
+    touch "$RESTART_LOG"
+    python3 -c "
+import json, sys
+entry = {
+    'time': sys.argv[2],
+    'reason': sys.argv[3],
+    'detail': sys.argv[4],
+    'counts': dict(zip(('r1', 'r2', 'r3'), (int(v.split()[0]) for v in sys.argv[5:8])))  # first token only: tolerate stray whitespace
+}
+with open(sys.argv[1], 'a') as f:
+    f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+" "$RESTART_LOG" "$ts" "$reason" "$detail" "$r1_count" "$r2_count" "$r3_count" 2>/dev/null
+}
+
+# Return 0 while the cooldown_until value stored in the state file is still later than the current epoch time, 1 otherwise.
+is_in_cooldown() {
+    local cooldown_until
+    cooldown_until=$(state_get "cooldown_until")
+    if [ -n "$cooldown_until" ] && [ "$cooldown_until" != "None" ]; then
        local now
        now=$(date +%s)
-        local age=$(( now - mtime ))
-        [ "$age" -gt "$MTIME_BUFFER" ] && continue
+        if [ "$now" -lt "$cooldown_until" ]; then
+            return 0  # still cooling down
+        fi
+    fi
+    return 1  # not in cooldown
+}

-        # 跳过空文件
-        local fsize
-        fsize=$(wc -c < "$jsonl" | tr -d ' ')
-        [ "$fsize" -lt 100 ] && continue
+# Emit the Gateway log lines whose timestamp falls within the last CHECK_WINDOW seconds.
+# Output: the matching raw log lines on stdout (the caller counts them with grep).
+get_recent_lines() {
+    if [ ! -f "$LOG_FILE" ]; then
+        return
+    fi
+    local cutoff
+    cutoff=$(python3 -c "
+from datetime import datetime, timedelta, timezone
+import sys
+tz = timezone(timedelta(hours=8))
+cutoff = datetime.now(tz) - timedelta(seconds=${CHECK_WINDOW})
+print(cutoff.strftime('%Y-%m-%dT%H:%M:%S'))
+" 2>/dev/null)
+    [ -z "$cutoff" ] && return

-        # 搜索 429 错误：errorCode=1305 或 errorMessage 含 "429"
-        local found=0
-        found=$(grep -c '"errorCode".*"1305"\|"errorMessage".*"429"' "$jsonl" 2>/dev/null || true)
-        [ -z "$found" ] && found=0
-        if [ "$found" -gt 0 ]; then
-            local recent
-            recent=$(python3 -c "
+    # Time filter in python3: the first 19 characters of each JSON line's 'time' field are string-compared with the cutoff above, which is computed in a fixed UTC+8 zone (assumes 'time' is ISO-8601 in that same zone — TODO confirm).
+    python3 -c "
 import json, sys
 cutoff = '${cutoff}'
-count = 0
-with open('${jsonl}') as f:
+with open('${LOG_FILE}') as f:
    for line in f:
+        line = line.strip()
+        if not line:
+            continue
        try:
            d = json.loads(line)
-            ts = d.get('timestamp', '')[:19]
-            if ts < cutoff:
-                continue
-            msg = d.get('message', {})
-            if msg.get('stopReason') == 'error':
-                err = str(msg.get('errorMessage', ''))
-                code = msg.get('errorCode') or ''
-                if '429' in err or '1305' in code:
-                    count += 1
+            ts = d.get('time', '')[:19]  # 截取到秒
+            if ts >= cutoff:
+                sys.stdout.write(line + '\n')
        except:
            pass
-print(count)
-" 2>/dev/null || echo 0)
-            count=$(( count + recent ))
-        fi
-    done
-    echo "$count"
+" 2>/dev/null
+}
+
+# Count how many recent log lines match each detection rule.
+# Results are returned in the globals R1_COUNT, R2_COUNT and R3_COUNT.
+# An empty window leaves all three counters at 0.
+count_rules() {
+    local recent_lines
+    recent_lines=$(get_recent_lines)
+
+    R1_COUNT=0
+    R2_COUNT=0
+    R3_COUNT=0
+    if [ -z "$recent_lines" ]; then
+        return
+    fi
+
+    # grep -c prints "0" and exits 1 when nothing matches. Under
+    # "set -o pipefail" the former "|| echo 0" fallback then appended a second
+    # "0", producing a non-numeric two-line value; "|| true" only swallows
+    # the exit status and keeps grep's own count as the sole output.
+
+    # R1: lines containing both "lane task error" and "FailoverError"
+    R1_COUNT=$(printf '%s\n' "$recent_lines" | grep "lane task error" | grep -c "FailoverError" || true)
+
+    # R2: lines containing both "stalled session" and "recovery=none"
+    R2_COUNT=$(printf '%s\n' "$recent_lines" | grep "stalled session" | grep -c "recovery=none" || true)
+
+    # R3: lines containing rate_limit or the double-quoted literal "429"
+    R3_COUNT=$(printf '%s\n' "$recent_lines" | grep -c "rate_limit\|\"429\"" || true)
 }

 # === 主逻辑 ===

-# 1. 先检查 Gateway 是否在运行
-if ! $HEALTH_CMD &>/dev/null; then
-    log "WARN: Gateway health check failed, attempting restart"
-    $RESTART_CMD 2>&1 || true
-    echo 0 > "$STATE_FILE"
+log "watchdog v2 check start (log: $LOG_FILE)"
+
+# 1. Make sure the Gateway log file exists. NOTE(review): this exits before the health check that follows, so no health check runs while the log file is missing — confirm that is intended.
+if [ ! -f "$LOG_FILE" ]; then
+    log "INFO: Gateway log file not found ($LOG_FILE), skipping detection"
    exit 0
 fi

-# 2. 统计最近 429 错误
-four29_count=$(count_recent_429)
-
-# 3. 判断：CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启
-if [ "$four29_count" -ge "$THRESHOLD" ]; then
-    log "ALERT: ${four29_count} 429 errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), restarting Gateway"
-    $RESTART_CMD 2>&1
-    log "Gateway restart completed"
-    echo "$four29_count" > "$STATE_FILE"
-elif [ "$four29_count" -gt 0 ]; then
-    log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}, no restart)"
-    echo "$four29_count" > "$STATE_FILE"
-else
-    # 只在有前次记录时打印 clear
-    if [ -f "$STATE_FILE" ] && [ "$(cat "$STATE_FILE")" != "0" ]; then
-        log "429 cleared"
-    fi
-    echo 0 > "$STATE_FILE"
+# 2. Check Gateway health first. NOTE(review): this restart path records a cooldown but never calls is_in_cooldown, so it can restart on every run while the health check keeps failing — confirm that is intended.
+if ! $HEALTH_CMD &>/dev/null; then
+    log "WARN: Gateway health check failed, attempting restart"
+    $RESTART_CMD 2>&1 || true
+    local_ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
+    cooldown_ts=$(($(date +%s) + COOLDOWN))
+    state_write "$local_ts" "health_fail" "$cooldown_ts"
+    log_restart "health_fail" "Gateway health check failed" 0 0 0
+    log "Gateway restart completed (reason: health_fail)"
+    exit 0
+fi
+
+# 3. Count hits for each rule
+count_rules
+
+log "rule counts: R1(FailoverError)=${R1_COUNT}/${R1_THRESHOLD} R2(stalled)=${R2_COUNT}/${R2_THRESHOLD} R3(rate_limit)=${R3_COUNT}/${R3_THRESHOLD}"
+
+# 4. Decide whether a restart is triggered (first rule at or above its threshold wins: R1, then R2, then R3)
+triggered_reason=""
+triggered_detail=""
+
+if [ "$R1_COUNT" -ge "$R1_THRESHOLD" ]; then
+    triggered_reason="R1"
+    triggered_detail="FailoverError x${R1_COUNT}"
+elif [ "$R2_COUNT" -ge "$R2_THRESHOLD" ]; then
+    triggered_reason="R2"
+    triggered_detail="stalled recovery=none x${R2_COUNT}"
+elif [ "$R3_COUNT" -ge "$R3_THRESHOLD" ]; then
+    triggered_reason="R3"
+    triggered_detail="rate_limit/429 x${R3_COUNT}"
+fi
+
+if [ -n "$triggered_reason" ]; then
+    if is_in_cooldown; then
+        local_cooldown_until=$(state_get "cooldown_until")
+        log "ALERT: ${triggered_reason} triggered (${triggered_detail}) but in cooldown until ${local_cooldown_until}, skipping restart"
+    else
+        log "ALERT: ${triggered_reason} triggered (${triggered_detail}), restarting Gateway"
+        $RESTART_CMD 2>&1
+        local_ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
+        cooldown_ts=$(($(date +%s) + COOLDOWN))
+        state_write "$local_ts" "$triggered_reason" "$cooldown_ts"
+        log_restart "$triggered_reason" "$triggered_detail" "$R1_COUNT" "$R2_COUNT" "$R3_COUNT"
+        log "Gateway restart completed (reason: ${triggered_reason})"
+    fi
+else
+    # No rule reached its threshold; nothing to do
+    log "all clear"
 fi