Files
sanguo_moziplus_v2/scripts/gateway-watchdog.sh
T
2026-05-28 13:15:30 +08:00

127 lines
4.1 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# gateway-watchdog.sh — 检测 OpenClaw Gateway 429 并自动重启
#
# 机制:
# 1. 检查最近 CHECK_WINDOW 秒内的 session jsonl 日志
# 2. 统计 429 错误次数(errorCode=1305 或 errorMessage 含 "429")
# 3. CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启 Gateway
#
# 部署:cron 每分钟执行一次
# * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1
set -euo pipefail

# cron starts jobs with a minimal PATH; prepend the usual tool locations.
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

# === Single-instance lock ===
# macOS ships without flock(1), so an atomic mkdir serves as the mutex.
LOCK_FILE="/tmp/gateway-watchdog.lock"
# A lock directory older than this many minutes is treated as a leftover from
# a run that died without executing its EXIT trap (SIGKILL, power loss) and is
# reclaimed. Without this, one orphaned lock disables the watchdog for good.
LOCK_STALE_MINUTES=10

if ! mkdir "$LOCK_FILE" 2>/dev/null; then
  # find prints the path only when the directory's mtime is older than the
  # limit; -maxdepth/-mmin are accepted by both BSD and GNU find.
  if [[ -n "$(find "$LOCK_FILE" -maxdepth 0 -type d \
      -mmin "+${LOCK_STALE_MINUTES}" 2>/dev/null)" ]]; then
    printf '[%s] WARN: reclaiming stale lock %s (older than %s min)\n' \
      "$(date '+%Y-%m-%d %H:%M:%S')" "$LOCK_FILE" "$LOCK_STALE_MINUTES" >&2
    rmdir "$LOCK_FILE" 2>/dev/null || true
    # Another instance may win the race for the fresh lock; yield to it.
    mkdir "$LOCK_FILE" 2>/dev/null || exit 0
  else
    # A live instance holds the lock.
    exit 0
  fi
fi
trap 'rmdir "$LOCK_FILE" 2>/dev/null' EXIT

# === Configuration ===
CHECK_WINDOW=120                       # how many seconds of log history to inspect
THRESHOLD=3                            # 429 count within CHECK_WINDOW that triggers a restart
MTIME_BUFFER=$(( CHECK_WINDOW + 60 ))  # file-mtime cutoff: window plus slack so nothing is missed
STATE_FILE="/tmp/gateway-watchdog-429-count"
LOG_DIR="/Users/chufeng/.openclaw/agents"
RESTART_CMD="openclaw gateway restart"
HEALTH_CMD="openclaw gateway health"
# === Functions ===

# Write all arguments to stdout as one line, prefixed with the current local
# time in "[YYYY-MM-DD HH:MM:SS]" form.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Count the 429 entries in one session log that are not older than the cutoff.
# Arguments: $1 - cutoff as "YYYY-MM-DDTHH:MM:SS" (UTC), $2 - path to a .jsonl file
# Outputs:   the count on stdout
_count_429_in_file() {
  # Cutoff and path travel via argv, never spliced into the Python source, so
  # a quote character in a path cannot break or inject code.
  python3 - "$1" "$2" <<'PY'
import json
import sys

cutoff, path = sys.argv[1], sys.argv[2]
count = 0
# Explicit UTF-8: cron usually runs without a locale, and logs may hold non-ASCII text.
with open(path, encoding="utf-8", errors="replace") as fh:
    for line in fh:
        try:
            entry = json.loads(line)
            # Lexicographic comparison of the first 19 characters; this is only
            # meaningful if timestamps are ISO-8601 in UTC -- TODO confirm.
            if entry.get("timestamp", "")[:19] < cutoff:
                continue
            msg = entry.get("message") or {}
            if msg.get("stopReason") != "error":
                continue
            err = str(msg.get("errorMessage", ""))
            # str() so that a numeric errorCode (1305) matches as well as "1305".
            code = str(msg.get("errorCode") or "")
            if "429" in err or "1305" in code:
                count += 1
        except Exception:
            # Best effort: a malformed or unexpectedly shaped line is skipped.
            continue
print(count)
PY
}

# Count the 429 (rate-limit) errors logged during the last CHECK_WINDOW seconds
# across all session logs under LOG_DIR.
#
# Globals:   LOG_DIR, CHECK_WINDOW, MTIME_BUFFER (read)
# Arguments: none
# Outputs:   the total as a decimal number on stdout; warnings on stderr
# Returns:   0
count_recent_429() {
  local cutoff now session mtime fsize recent
  local total=0

  # BSD date (macOS) syntax first, GNU date as the fallback.
  if ! cutoff=$(date -u -v-"${CHECK_WINDOW}"S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null) \
      && ! cutoff=$(date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S' 2>/dev/null); then
    printf 'WARN: cannot compute the cutoff timestamp; reporting 0\n' >&2
    printf '0\n'
    return 0
  fi
  now=$(date +%s)

  for session in "$LOG_DIR"/*/sessions/*.jsonl; do
    # Trajectory and checkpoint files are not session logs.
    [[ "$session" == *trajectory* || "$session" == *checkpoint* ]] && continue
    # Also covers the unmatched-glob case, where the pattern stays literal.
    [[ -f "$session" ]] || continue

    # GNU form first: BSD stat rejects -c without printing anything, whereas
    # GNU stat given the BSD form (-f '%m') prints filesystem info AND fails,
    # which would pollute the value if the BSD form were tried first.
    mtime=$(stat -c '%Y' "$session" 2>/dev/null) \
      || mtime=$(stat -f '%m' "$session" 2>/dev/null) \
      || continue
    [[ "$mtime" =~ ^[0-9]+$ ]] || continue
    # Only files touched recently can contain entries inside the window.
    (( now - mtime > MTIME_BUFFER )) && continue

    # Skip empty or near-empty files.
    fsize=$(wc -c < "$session" 2>/dev/null || true)
    fsize=${fsize//[[:space:]]/}
    (( fsize < 100 )) && continue

    # Cheap prefilter; deliberately broader than the exact check done in
    # Python so that no real hit is discarded here.
    grep -q -e '"errorCode".*1305' -e '"errorMessage".*429' -- "$session" 2>/dev/null \
      || continue

    if ! recent=$(_count_429_in_file "$cutoff" "$session" 2>/dev/null) \
        || [[ ! "$recent" =~ ^[0-9]+$ ]]; then
      # Make parser failures visible instead of silently counting zero.
      printf 'WARN: could not analyse %s; ignoring it\n' "$session" >&2
      recent=0
    fi
    total=$(( total + recent ))
  done

  printf '%s\n' "$total"
}
# === Main ===
# Restart the Gateway when its health check fails or when at least THRESHOLD
# 429 errors were logged within CHECK_WINDOW seconds.
#
# Globals:  HEALTH_CMD, RESTART_CMD, STATE_FILE, CHECK_WINDOW, THRESHOLD (read)
# Files:    STATE_FILE                - last observed 429 count
#           STATE_FILE.last-restart   - epoch seconds of the last successful restart
# Returns:  0 normally; the restart command's status if a threshold-triggered
#           restart fails
main() {
  local -a health_cmd restart_cmd
  local stamp_file="${STATE_FILE}.last-restart"
  local four29_count now rc
  local last_restart=0

  # Split the command strings into words once; unlike an unquoted expansion
  # this does not subject the words to pathname expansion.
  read -r -a health_cmd <<< "$HEALTH_CMD"
  read -r -a restart_cmd <<< "$RESTART_CMD"

  # 1. Is the Gateway healthy at all?
  if ! "${health_cmd[@]}" &>/dev/null; then
    log "WARN: Gateway health check failed, attempting restart"
    if "${restart_cmd[@]}" 2>&1; then
      date +%s > "$stamp_file"
    else
      log "ERROR: Gateway restart command failed (exit $?)"
    fi
    echo 0 > "$STATE_FILE"
    exit 0
  fi

  # 2. Count the recent 429 errors.
  four29_count=$(count_recent_429)
  if [[ ! "$four29_count" =~ ^[0-9]+$ ]]; then
    log "WARN: unexpected 429 count '${four29_count}', treating it as 0"
    four29_count=0
  fi

  # 3. Restart once the count within CHECK_WINDOW reaches THRESHOLD.
  if (( four29_count >= THRESHOLD )); then
    echo "$four29_count" > "$STATE_FILE"

    now=$(date +%s)
    if [[ -f "$stamp_file" ]]; then
      last_restart=$(cat "$stamp_file" 2>/dev/null || true)
      [[ "$last_restart" =~ ^[0-9]+$ ]] || last_restart=0
    fi

    # Cooldown: entries written before the previous restart stay inside the
    # counting window for CHECK_WINDOW seconds. Without this guard the same
    # burst would trigger another restart on the next cron tick.
    if (( now - last_restart < CHECK_WINDOW )); then
      log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), Gateway restarted $(( now - last_restart ))s ago, skipping restart"
      return 0
    fi

    log "ALERT: ${four29_count} 429 errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), restarting Gateway"
    if "${restart_cmd[@]}" 2>&1; then
      log "Gateway restart completed"
      printf '%s\n' "$now" > "$stamp_file"
    else
      rc=$?
      # Leave the stamp untouched so the next run retries immediately.
      log "ERROR: Gateway restart command failed (exit ${rc})"
      return "$rc"
    fi
  elif (( four29_count > 0 )); then
    log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}, no restart)"
    echo "$four29_count" > "$STATE_FILE"
  else
    # Announce the all-clear only when the previous run had recorded errors.
    if [[ -f "$STATE_FILE" && "$(cat "$STATE_FILE")" != "0" ]]; then
      log "429 cleared"
    fi
    echo 0 > "$STATE_FILE"
  fi
}

main "$@"