diff --git a/scripts/gateway-watchdog.sh b/scripts/gateway-watchdog.sh index 1427d45..30200c7 100755 --- a/scripts/gateway-watchdog.sh +++ b/scripts/gateway-watchdog.sh @@ -4,7 +4,7 @@ # 机制: # 1. 检查最近 CHECK_WINDOW 秒内的 session jsonl 日志 # 2. 统计 429 错误次数(errorCode=1305 或 errorMessage 含 "429") -# 3. 连续 THRESHOLD 次检测都发现新 429 → 重启 Gateway +# 3. CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启 Gateway # # 部署:cron 每分钟执行一次 # * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1 @@ -14,9 +14,17 @@ set -euo pipefail # crontab 环境下 PATH 不完整,补上 export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH" +# === flock 防并发 === +LOCK_FILE="/tmp/gateway-watchdog.lock" +exec 9>"$LOCK_FILE" +if ! flock -n 9; then + exit 0 +fi + # === 配置 === CHECK_WINDOW=120 # 检查最近多少秒的日志 -THRESHOLD=3 # 连续检测到多少次 429 才重启 +THRESHOLD=3 # CHECK_WINDOW 内发现多少个 429 就重启 +MTIME_BUFFER=180 # 文件 mtime 阈值(CHECK_WINDOW + buffer,防漏检) STATE_FILE="/tmp/gateway-watchdog-429-count" LOG_DIR="/Users/chufeng/.openclaw/agents" RESTART_CMD="openclaw gateway restart" @@ -33,20 +41,22 @@ count_recent_429() { local cutoff=$(date -u -v-${CHECK_WINDOW}S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null || date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S') local count=0 - # 只看 session jsonl(排除 trajectory) for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do + # 排除 trajectory 和 checkpoint 文件 [[ "$jsonl" == *"trajectory"* ]] && continue + [[ "$jsonl" == *"checkpoint"* ]] && continue [ -f "$jsonl" ] || continue - # 只看最近修改的文件(性能优化) + + # mtime 过滤(加 buffer 防漏检) local mtime mtime=$(stat -f '%m' "$jsonl" 2>/dev/null | tr -d ' \n' || stat -c '%Y' "$jsonl" 2>/dev/null | tr -d ' \n') [ -z "$mtime" ] && continue local now now=$(date +%s) local age=$(( now - mtime )) - # 文件超过 CHECK_WINDOW 秒没修改就跳过 - [ "$age" -gt "$CHECK_WINDOW" ] && continue - # 文件小于 100 字节就跳过(空/刚创建) + [ "$age" -gt "$MTIME_BUFFER" ] && continue + + # 跳过空文件 local fsize fsize=$(wc -c < "$jsonl" | tr -d ' ') [ "$fsize" -lt 100 ] && continue @@ -55,7 +65,6 @@ count_recent_429() { local found=0 found=$(grep -c '"errorCode".*"1305"\|"errorMessage".*"429"' "$jsonl" 2>/dev/null || true) [ -z "$found" ] && found=0 - # 进一步过滤:只统计时间窗口内的 if [ "$found" -gt 0 ]; then local recent recent=$(python3 -c " @@ -72,7 +81,7 @@ with open('${jsonl}') as f: msg = d.get('message', {}) if msg.get('stopReason') == 'error': err = str(msg.get('errorMessage', '')) - code = str(msg.get('errorCode', '')) + code = msg.get('errorCode') or '' if '429' in err or '1305' in code: count += 1 except: @@ -98,29 +107,19 @@ fi # 2. 统计最近 429 错误 four29_count=$(count_recent_429) -# 3. 读取连续计数 -consecutive=0 -if [ -f "$STATE_FILE" ]; then - consecutive=$(cat "$STATE_FILE") -fi - -# 4. 判断 -if [ "$four29_count" -gt 0 ]; then - consecutive=$(( consecutive + 1 )) - log "429 detected: ${four29_count} recent errors, consecutive=${consecutive}/${THRESHOLD}" -else - if [ "$consecutive" -gt 0 ]; then - log "429 cleared (was ${consecutive} consecutive, now 0)" - fi - consecutive=0 -fi - -echo "$consecutive" > "$STATE_FILE" - -# 5. 达到阈值 → 重启 -if [ "$consecutive" -ge "$THRESHOLD" ]; then - log "ALERT: ${consecutive} consecutive 429 detections, restarting Gateway" +# 3. 判断:CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启 +if [ "$four29_count" -ge "$THRESHOLD" ]; then + log "ALERT: ${four29_count} 429 errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), restarting Gateway" $RESTART_CMD 2>&1 log "Gateway restart completed" + echo "$four29_count" > "$STATE_FILE" +elif [ "$four29_count" -gt 0 ]; then + log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}, no restart)" + echo "$four29_count" > "$STATE_FILE" +else + # 只在有前次记录时打印 clear + if [ -f "$STATE_FILE" ] && [ "$(cat "$STATE_FILE")" != "0" ]; then + log "429 cleared" + fi echo 0 > "$STATE_FILE" fi