Files
sanguo_moziplus_v2/scripts/gateway-watchdog.sh
T
2026-05-28 12:28:32 +08:00

126 lines
3.9 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# gateway-watchdog.sh — 检测 OpenClaw Gateway 429 并自动重启
#
# 机制:
# 1. 检查最近 CHECK_WINDOW 秒内的 session jsonl 日志
# 2. 统计 429 错误次数(errorCode=1305 或 errorMessage 含 "429")
# 3. 连续 THRESHOLD 次检测都发现新 429 → 重启 Gateway
#
# 部署:cron 每分钟执行一次
# * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1
set -euo pipefail

# PATH is incomplete when the script runs from crontab, so prepend the usual
# install locations. ${PATH:+...} keeps `set -u` from aborting if PATH is unset
# and avoids leaving a trailing ':' (which would mean "current directory").
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin${PATH:+:${PATH}}"

# === Configuration ===
# Every setting keeps its previous default but can be overridden through the
# matching GATEWAY_WATCHDOG_* environment variable (e.g. in the crontab entry).

# How many seconds of log history to inspect.
CHECK_WINDOW="${GATEWAY_WATCHDOG_CHECK_WINDOW:-120}"
# How many consecutive runs must see 429 errors before the Gateway is restarted.
THRESHOLD="${GATEWAY_WATCHDOG_THRESHOLD:-3}"
# File that persists the consecutive-detection counter between runs.
STATE_FILE="${GATEWAY_WATCHDOG_STATE_FILE:-/tmp/gateway-watchdog-429-count}"
# Root directory scanned for <agent>/sessions/*.jsonl session logs.
LOG_DIR="${GATEWAY_WATCHDOG_LOG_DIR:-/Users/chufeng/.openclaw/agents}"
# Commands (program plus arguments in one string) used to restart / probe the Gateway.
RESTART_CMD="${GATEWAY_WATCHDOG_RESTART_CMD:-openclaw gateway restart}"
HEALTH_CMD="${GATEWAY_WATCHDOG_HEALTH_CMD:-openclaw gateway health}"
# === 函数 ===
# Write all arguments, joined by spaces, to stdout as a single line prefixed
# with the current local time in "[YYYY-MM-DD HH:MM:SS]" form.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Count recent 429 (rate-limit) error records in the session logs.
#
# A record is counted when its `timestamp` falls within the last CHECK_WINDOW
# seconds (compared as a UTC "YYYY-MM-DDTHH:MM:SS" string), its
# message.stopReason is "error", and either message.errorMessage contains
# "429" or message.errorCode contains "1305".
#
# Globals:
#   LOG_DIR       (read) root directory; files are matched as
#                 "$LOG_DIR"/*/sessions/*.jsonl
#   CHECK_WINDOW  (read) look-back window in seconds
# Arguments: none
# Outputs:   the total number of matching records, on stdout
# Returns:   0
count_recent_429() {
  local now cutoff_epoch total=0
  local jsonl mtime fsize recent

  # Take "now" once so every file is judged against the same instant.
  now=$(date +%s)
  # The cutoff is handed to Python as an epoch and formatted there, which
  # avoids the BSD (`date -v`) vs GNU (`date -d`) difference.
  cutoff_epoch=$(( now - CHECK_WINDOW ))

  for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do
    # Only session logs are inspected; trajectory files are excluded.
    [[ "$jsonl" == *trajectory* ]] && continue
    # Also covers the "glob matched nothing" case (pattern left as a literal).
    [[ -f "$jsonl" ]] || continue

    # Cheap filter 1: skip files not modified within the window.
    # GNU stat is tried first because it fails cleanly on BSD/macOS, whereas
    # BSD-style `stat -f` on GNU prints filesystem info before failing.
    mtime=$(stat -c '%Y' -- "$jsonl" 2>/dev/null) \
      || mtime=$(stat -f '%m' -- "$jsonl" 2>/dev/null) \
      || continue
    [[ "$mtime" =~ ^[0-9]+$ ]] || continue
    (( now - mtime > CHECK_WINDOW )) && continue

    # Cheap filter 2: skip files under 100 bytes (empty / just created).
    fsize=$(wc -c < "$jsonl") || continue
    fsize=${fsize//[[:space:]]/}
    [[ "$fsize" =~ ^[0-9]+$ ]] || continue
    (( fsize < 100 )) && continue

    # Cheap filter 3: only start Python when some line could match. The
    # pattern is deliberately a superset of the Python check below (substring
    # match, quoted or numeric values) so it never hides a real hit.
    grep -qE -- '"error(Code|Message)".*(429|1305)' "$jsonl" 2>/dev/null \
      || continue

    # Authoritative filter. The cutoff and path travel via argv, never by
    # interpolation into the Python source, so quotes in a path are harmless.
    recent=$(python3 -c '
import json, sys, time

cutoff = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(int(sys.argv[1])))
hits = 0
with open(sys.argv[2], encoding="utf-8", errors="replace") as fh:
    for line in fh:
        try:
            rec = json.loads(line)
            if rec.get("timestamp", "")[:19] < cutoff:
                continue
            msg = rec.get("message", {})
            if msg.get("stopReason") != "error":
                continue
            err = str(msg.get("errorMessage", ""))
            code = str(msg.get("errorCode", ""))
            if "429" in err or "1305" in code:
                hits += 1
        except Exception:
            # Best effort: a malformed or unexpected record is skipped.
            continue
print(hits)
' "$cutoff_epoch" "$jsonl" 2>/dev/null) || recent=0
    [[ "$recent" =~ ^[0-9]+$ ]] || recent=0

    total=$(( total + recent ))
  done

  printf '%s\n' "$total"
}
# === Main logic ===
# $HEALTH_CMD and $RESTART_CMD each hold a command plus its arguments in a
# single string, so they are expanded unquoted on purpose (word splitting).

# 1. Check that the Gateway is running first. If the health probe fails,
#    attempt a restart right away, reset the streak and stop.
# shellcheck disable=SC2086
if ! $HEALTH_CMD &>/dev/null; then
  log "WARN: Gateway health check failed, attempting restart"
  # Best effort: a failed restart here must not abort before the reset below.
  # shellcheck disable=SC2086
  $RESTART_CMD 2>&1 || true
  printf '0\n' > "$STATE_FILE"
  exit 0
fi

# 2. Count recent 429 errors. Anything that is not a plain non-negative
#    integer is treated as "no errors seen".
four29_count=$(count_recent_429)
[[ "$four29_count" =~ ^[0-9]+$ ]] || four29_count=0

# 3. Load the consecutive-detection counter. The state file lives in a shared
#    temp directory, so its content is validated before it reaches arithmetic:
#    an empty, corrupt or tampered file must not abort every later run under
#    `set -u`, nor be evaluated as an arithmetic expression.
consecutive=0
if [[ -f "$STATE_FILE" ]]; then
  consecutive=$(cat -- "$STATE_FILE" 2>/dev/null) || consecutive=0
fi
[[ "$consecutive" =~ ^[0-9]+$ ]] || consecutive=0
# Force base 10 so a value such as "08" is not parsed as invalid octal.
consecutive=$(( 10#$consecutive ))

# 4. Extend the streak when errors were seen, otherwise clear it.
if (( 10#$four29_count > 0 )); then
  consecutive=$(( consecutive + 1 ))
  log "429 detected: ${four29_count} recent errors, consecutive=${consecutive}/${THRESHOLD}"
else
  if (( consecutive > 0 )); then
    log "429 cleared (was ${consecutive} consecutive, now 0)"
  fi
  consecutive=0
fi
printf '%s\n' "$consecutive" > "$STATE_FILE"

# 5. Threshold reached -> restart the Gateway.
if (( consecutive >= THRESHOLD )); then
  log "ALERT: ${consecutive} consecutive 429 detections, restarting Gateway"
  # shellcheck disable=SC2086
  if $RESTART_CMD 2>&1; then
    log "Gateway restart completed"
    printf '0\n' > "$STATE_FILE"
  else
    rc=$?
    # The counter is left at/above the threshold so the next run tries again;
    # the failure is logged instead of exiting silently through `set -e`.
    log "ERROR: Gateway restart failed (exit ${rc}); will retry on next run"
    exit "$rc"
  fi
fi