#!/bin/bash
# gateway-watchdog.sh — detect OpenClaw Gateway 429 errors and restart the Gateway.
#
# How it works:
#   1. Scan the session jsonl logs that were modified within the last
#      CHECK_WINDOW seconds.
#   2. Count 429 errors (errorCode containing 1305, or errorMessage containing
#      "429") whose timestamp falls inside that window.
#   3. When THRESHOLD consecutive runs each see at least one such error,
#      restart the Gateway.
#
# NOTE(review): when CHECK_WINDOW is longer than the cron interval, successive
# runs inspect overlapping windows, so one error can be seen by several runs.
#
# Deployment: run from cron once per minute
#   * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1

set -euo pipefail

# === Configuration ===
readonly CHECK_WINDOW=120   # how many seconds of recent log activity to inspect
readonly THRESHOLD=3        # consecutive detections required before restarting
# NOTE(review): predictable path in a world-writable directory; its content is
# validated before use, but consider moving it to a user-owned directory.
readonly STATE_FILE="/tmp/gateway-watchdog-429-count"
readonly LOG_DIR="/Users/chufeng/.openclaw/agents"
# Commands are arrays so they expand to exact argument vectors.
RESTART_CMD=(openclaw gateway restart)
HEALTH_CMD=(openclaw gateway health)
readonly RESTART_CMD HEALTH_CMD

# === Functions ===

#######################################
# Print a timestamped message.
# Arguments: message words
# Outputs:   one line on stdout
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

#######################################
# Print a file's modification time as epoch seconds.
# The GNU form is tried first: BSD stat rejects -c without writing to stdout,
# whereas GNU stat treats -f as "filesystem mode" and prints unrelated text
# before failing, which would corrupt the captured value.
# Arguments: $1 - file path
# Outputs:   epoch seconds on stdout
# Returns:   non-zero when neither stat flavour succeeds
#######################################
file_mtime() {
  stat -c %Y -- "$1" 2>/dev/null || stat -f %m -- "$1" 2>/dev/null
}

#######################################
# Count 429 error records in one jsonl file at or after a cutoff timestamp.
# A record is counted when message.stopReason is "error" and either
# message.errorMessage contains "429" or message.errorCode contains "1305".
# The cutoff and path are passed as arguments rather than pasted into the
# Python source, so unusual characters in the path cannot alter the program.
# Arguments: $1 - cutoff, formatted %Y-%m-%dT%H:%M:%S
#            $2 - path to the jsonl file
# Outputs:   the count on stdout
#######################################
count_429_in_file() {
  python3 -c '
import json
import sys

cutoff, path = sys.argv[1], sys.argv[2]
count = 0
# Decode leniently so a stray byte cannot abort the whole scan.
with open(path, encoding="utf-8", errors="replace") as f:
    for line in f:
        try:
            d = json.loads(line)
            # String comparison of the first 19 characters against the cutoff.
            ts = d.get("timestamp", "")[:19]
            if ts < cutoff:
                continue
            msg = d.get("message", {})
            if msg.get("stopReason") == "error":
                err = str(msg.get("errorMessage", ""))
                code = str(msg.get("errorCode", ""))
                if "429" in err or "1305" in code:
                    count += 1
        except Exception:
            # Best effort: skip lines that are not JSON or not the expected shape.
            continue
print(count)
' "$1" "$2"
}

#######################################
# Count recent 429 errors across every agent's session logs.
# Globals:   LOG_DIR, CHECK_WINDOW (read)
# Outputs:   the total count on stdout
# NOTE(review): the cutoff is computed in UTC; this presumes the log
# timestamps are UTC ISO-8601 strings — confirm against the log producer.
#######################################
count_recent_429() {
  local cutoff now total=0
  # BSD date syntax first, GNU date as the fallback.
  cutoff=$(date -u -v-"${CHECK_WINDOW}"S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null \
    || date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S')
  now=$(date +%s)

  local jsonl mtime recent
  for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do
    # Also skips the literal pattern when the glob matches nothing.
    [[ -f "$jsonl" ]] || continue

    # Performance: ignore files not modified within the window.
    mtime=$(file_mtime "$jsonl") || continue
    [[ "$mtime" =~ ^[0-9]+$ ]] || continue
    if (( now - mtime > CHECK_WINDOW )); then
      continue
    fi

    # Cheap pre-filter. It must accept everything the Python check can count,
    # so it only looks for the bare substrings anywhere in the file.
    if ! grep -qF -e '429' -e '1305' -- "$jsonl" 2>/dev/null; then
      continue
    fi

    # Best effort: a file that cannot be parsed contributes zero.
    recent=$(count_429_in_file "$cutoff" "$jsonl" 2>/dev/null) || recent=0
    [[ "$recent" =~ ^[0-9]+$ ]] || recent=0
    total=$(( total + recent ))
  done

  printf '%s\n' "$total"
}

#######################################
# Read the consecutive-detection counter from STATE_FILE.
# Anything other than a plain decimal number is treated as 0, because the
# value is later used in arithmetic, where arbitrary text would be evaluated
# as an expression.
# Globals:   STATE_FILE (read)
# Outputs:   the counter on stdout
#######################################
read_consecutive() {
  local value=""
  if [[ -f "$STATE_FILE" ]]; then
    # read returns non-zero when the last line has no newline; value is still set.
    IFS= read -r value < "$STATE_FILE" || true
  fi
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a leading zero is not parsed as octal.
    printf '%s\n' "$(( 10#$value ))"
  else
    printf '0\n'
  fi
}

# === Main logic ===

main() {
  # 1. Make sure the Gateway is running at all.
  if ! "${HEALTH_CMD[@]}" &>/dev/null; then
    log "WARN: Gateway health check failed, attempting restart"
    "${RESTART_CMD[@]}" 2>&1 || true
    echo 0 > "$STATE_FILE"
    exit 0
  fi

  # 2. Count recent 429 errors.
  local four29_count
  four29_count=$(count_recent_429)

  # 3. Load the consecutive-detection counter.
  local consecutive
  consecutive=$(read_consecutive)

  # 4. Update the counter.
  if (( four29_count > 0 )); then
    consecutive=$(( consecutive + 1 ))
    log "429 detected: ${four29_count} recent errors, consecutive=${consecutive}/${THRESHOLD}"
  else
    if (( consecutive > 0 )); then
      log "429 cleared (was ${consecutive} consecutive, now 0)"
    fi
    consecutive=0
  fi

  echo "$consecutive" > "$STATE_FILE"

  # 5. Threshold reached: restart.
  if (( consecutive >= THRESHOLD )); then
    log "ALERT: ${consecutive} consecutive 429 detections, restarting Gateway"
    local rc=0
    "${RESTART_CMD[@]}" 2>&1 || rc=$?
    if (( rc != 0 )); then
      # The counter stays at or above THRESHOLD so the next run retries.
      log "ERROR: Gateway restart failed (exit ${rc})"
      exit "$rc"
    fi
    log "Gateway restart completed"
    echo 0 > "$STATE_FILE"
  fi
}

main "$@"