auto-sync: 2026-05-28 12:25:12
This commit is contained in:
Executable
+115
@@ -0,0 +1,115 @@
|
||||
#!/bin/bash
|
||||
# gateway-watchdog.sh — detect OpenClaw Gateway 429 errors and restart automatically
#
# How it works:
#   1. Scan the session jsonl logs modified within the last CHECK_WINDOW seconds
#   2. Count 429 errors (errorCode=1305, or errorMessage containing "429")
#   3. If THRESHOLD consecutive checks each find 429 errors inside that window,
#      restart the Gateway
#
# Deployment: run from cron once per minute
#   * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1
|
||||
|
||||
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# === Configuration ===
CHECK_WINDOW=120   # look-back window, in seconds, applied to the session logs
THRESHOLD=3        # consecutive checks that must see 429s before a restart
# Counter of consecutive detections, persisted between cron runs.
# NOTE(review): predictable name in a world-writable directory; consider a
# per-user state directory if this host is shared.
STATE_FILE="/tmp/gateway-watchdog-429-count"
# Root directory that contains <agent>/sessions/*.jsonl
LOG_DIR="/Users/chufeng/.openclaw/agents"
# Commands are stored as plain strings and are word-split when executed.
RESTART_CMD="openclaw gateway restart"
HEALTH_CMD="openclaw gateway health"
|
||||
|
||||
# === 函数 ===
|
||||
|
||||
#######################################
# Write a timestamped message to stdout.
# Arguments: the message words; they are joined with single spaces.
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" (local time) on stdout
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
#######################################
# Count recent 429 (rate-limit) errors recorded in the agent session logs.
#
# A log entry is counted when its "timestamp" (first 19 characters, compared
# as a string against a UTC cutoff) is inside the window, its
# message.stopReason is "error", and either message.errorMessage contains
# "429" or message.errorCode contains "1305".
#
# Globals:
#   CHECK_WINDOW (read) - look-back window in seconds
#   LOG_DIR      (read) - directory containing <agent>/sessions/*.jsonl
# Arguments: none
# Outputs:   total number of matching entries on stdout; warnings on stderr
# Returns:   0
#######################################
count_recent_429() {
  local cutoff now jsonl mtime age recent
  local count=0

  # The JSON filtering runs in Python. Path and cutoff are passed through argv
  # instead of being pasted into the source, so unusual file names cannot break
  # (or inject into) the program. Keep this text free of single quotes.
  local -r py_filter='
import json
import sys

cutoff, path = sys.argv[1], sys.argv[2]
count = 0
# Explicit encoding: under cron the locale is often C/ASCII.
with open(path, encoding="utf-8", errors="replace") as f:
    for line in f:
        try:
            d = json.loads(line)
            if d.get("timestamp", "")[:19] < cutoff:
                continue
            msg = d.get("message", {})
            if msg.get("stopReason") != "error":
                continue
            err = str(msg.get("errorMessage", ""))
            code = str(msg.get("errorCode", ""))
            if "429" in err or "1305" in code:
                count += 1
        except (ValueError, TypeError, AttributeError):
            # Malformed line or unexpected shape: not a countable entry.
            continue
print(count)
'

  # UTC cutoff timestamp: BSD date syntax first, GNU date as the fallback.
  cutoff=$(date -u -v-"${CHECK_WINDOW}"S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null \
    || date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S' 2>/dev/null) \
    || cutoff=""
  if [[ -z "$cutoff" ]]; then
    printf 'count_recent_429: cannot compute cutoff timestamp\n' >&2
    echo 0
    return 0
  fi

  now=$(date +%s)

  # Walk every agent's session jsonl files.
  for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do
    # Also skips the literal pattern left behind when the glob matches nothing.
    [[ -f "$jsonl" ]] || continue

    # Cheap pre-check: skip files not modified inside the window.
    # GNU syntax goes first because GNU stat accepts "-f %m" as a file-system
    # format and exits 0 with non-numeric output; BSD stat rejects "-c".
    mtime=$(stat -c %Y "$jsonl" 2>/dev/null || stat -f %m "$jsonl" 2>/dev/null) \
      || mtime=""
    [[ "$mtime" =~ ^[0-9]+$ ]] || continue
    age=$(( now - mtime ))
    if (( age > CHECK_WINDOW )); then
      continue
    fi

    # Loose textual prefilter so Python is only started for candidate files.
    # It must stay a superset of what the Python filter accepts.
    grep -q -e '1305' -e '429' -- "$jsonl" 2>/dev/null || continue

    # Exact filtering: time window + stopReason + error code/message.
    recent=$(python3 -c "$py_filter" "$cutoff" "$jsonl" 2>/dev/null) || recent=0
    [[ "$recent" =~ ^[0-9]+$ ]] || recent=0
    count=$(( count + recent ))
  done

  echo "$count"
}
|
||||
|
||||
# === Main ===

# 1. Make sure the Gateway is running at all. When the health check fails,
#    restart immediately, reset the counter and stop; log analysis is skipped.
#    HEALTH_CMD / RESTART_CMD are intentionally unquoted: they hold a command
#    plus arguments and rely on word splitting.
if ! $HEALTH_CMD &>/dev/null; then
  log "WARN: Gateway health check failed, attempting restart"
  # Best effort: a failed restart is logged but does not abort this run.
  $RESTART_CMD 2>&1 || log "ERROR: Gateway restart failed (exit $?)"
  echo 0 > "$STATE_FILE"
  exit 0
fi

# 2. Count the recent 429 errors.
four29_count=$(count_recent_429)
# Guard the numeric tests below against unexpected output.
[[ "$four29_count" =~ ^[0-9]+$ ]] || four29_count=0

# 3. Load the number of consecutive detections from the previous runs.
consecutive=0
if [[ -f "$STATE_FILE" ]]; then
  IFS= read -r consecutive < "$STATE_FILE" || true
fi
# The state file lives in /tmp: accept plain decimal digits only, so an empty,
# corrupted or tampered file can neither abort the script nor be evaluated as
# an arithmetic expression. 10# avoids octal parsing of leading zeros.
if [[ "$consecutive" =~ ^[0-9]+$ ]]; then
  consecutive=$(( 10#$consecutive ))
else
  consecutive=0
fi

# 4. Update the counter.
if (( four29_count > 0 )); then
  consecutive=$(( consecutive + 1 ))
  log "429 detected: ${four29_count} recent errors, consecutive=${consecutive}/${THRESHOLD}"
else
  if (( consecutive > 0 )); then
    log "429 cleared (was ${consecutive} consecutive, now 0)"
  fi
  consecutive=0
fi

echo "$consecutive" > "$STATE_FILE"

# 5. Threshold reached -> restart the Gateway.
if (( consecutive >= THRESHOLD )); then
  log "ALERT: ${consecutive} consecutive 429 detections, restarting Gateway"
  rc=0
  $RESTART_CMD 2>&1 || rc=$?
  if (( rc != 0 )); then
    # Leave the counter untouched so the next run retries while 429s persist.
    log "ERROR: Gateway restart failed (exit ${rc})"
    exit "$rc"
  fi
  log "Gateway restart completed"
  echo 0 > "$STATE_FILE"
fi
|
||||
Reference in New Issue
Block a user