#!/bin/bash
# gateway-watchdog.sh — detect OpenClaw Gateway 429 errors and restart automatically
#
# How it works:
#   1. Scan the session jsonl logs covering the last CHECK_WINDOW seconds
#   2. Count 429 errors (errorCode=1305, or errorMessage containing "429")
#   3. If the count within CHECK_WINDOW is >= THRESHOLD, restart the Gateway
#
# Deployment: run from cron once per minute
#   * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1

set -euo pipefail

# cron runs with a minimal PATH, so prepend the usual tool locations.
# ${PATH:+...} keeps this safe under `set -u` when PATH is unset and avoids a
# trailing ':' (which would put the current directory on PATH) when it is empty.
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin${PATH:+:${PATH}}"
# === Single-instance lock ===
# macOS ships no flock(1), so an atomic mkdir is used as the lock. The owner's
# PID is stored inside the lock directory so that a lock left behind by a
# killed run (SIGKILL, crash) can be detected and reclaimed; otherwise every
# later cron run would exit silently forever.
LOCK_FILE="/tmp/gateway-watchdog.lock"
if ! mkdir "$LOCK_FILE" 2>/dev/null; then
  lock_pid=$(cat "$LOCK_FILE/pid" 2>/dev/null || true)
  # No readable PID (owner is just starting, or the lock predates PID
  # tracking) or the owner is still alive: treat the lock as held.
  if [[ ! "$lock_pid" =~ ^[0-9]+$ ]] || kill -0 "$lock_pid" 2>/dev/null; then
    exit 0
  fi
  # The recorded owner no longer exists: reclaim the stale lock.
  # NOTE(review): kill -0 also fails for a live process owned by another
  # user; this assumes the watchdog always runs as the same user.
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] WARN: removing stale lock left by pid ${lock_pid}"
  rm -rf -- "${LOCK_FILE:?}"
  # If another instance re-created the lock in the meantime, let it run.
  mkdir "$LOCK_FILE" 2>/dev/null || exit 0
fi
# Install the cleanup before writing the PID so a failed write still unlocks.
trap 'rm -rf -- "${LOCK_FILE:?}"' EXIT
echo "$$" > "$LOCK_FILE/pid"
# === Configuration ===
CHECK_WINDOW=120   # how many seconds of log history to inspect
THRESHOLD=3        # number of 429 errors within CHECK_WINDOW that triggers a restart
# Files whose mtime is older than this many seconds are skipped. Derived from
# CHECK_WINDOW (plus a 60 s safety margin) so that enlarging the window can
# never make the mtime filter narrower than the window itself.
MTIME_BUFFER=$(( CHECK_WINDOW + 60 ))
STATE_FILE="/tmp/gateway-watchdog-429-count"   # last observed 429 count
LOG_DIR="/Users/chufeng/.openclaw/agents"      # scanned as <agent>/sessions/*.jsonl
RESTART_CMD="openclaw gateway restart"
HEALTH_CMD="openclaw gateway health"
# === Functions ===

# Print one timestamped log line to stdout.
# Arguments: the message; all arguments are joined via "$*".
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" using the local time zone.
log() {
  # printf '%s' prints the message verbatim regardless of its content.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Count recent 429 (rate-limit) error records in the session logs.
#
# Globals:   LOG_DIR, CHECK_WINDOW, MTIME_BUFFER (all read)
# Arguments: none
# Outputs:   the number of matching records, as a single integer on stdout
# Returns:   0 on success; 1 if the cutoff timestamp cannot be computed
#
# A record counts when its "timestamp" (first 19 characters, compared as a
# string against a UTC cutoff) is not older than CHECK_WINDOW seconds, its
# message.stopReason is "error", and either message.errorMessage contains
# "429" or message.errorCode contains "1305".
count_recent_429() {
  local cutoff now total=0
  local jsonl mtime fsize recent

  # BSD date (macOS) first, GNU date as the fallback. An empty cutoff would
  # make every record look recent, so fail loudly instead of guessing.
  cutoff=$(date -u -v-"${CHECK_WINDOW}"S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null) \
    || cutoff=$(date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S' 2>/dev/null) \
    || cutoff=""
  if [[ -z "$cutoff" ]]; then
    echo "count_recent_429: cannot compute cutoff timestamp" >&2
    return 1
  fi
  now=$(date +%s)

  for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do
    # Skip trajectory and checkpoint files.
    case "$jsonl" in
      *trajectory* | *checkpoint*) continue ;;
    esac
    # Also skips the literal pattern left behind when the glob matches nothing.
    [[ -f "$jsonl" ]] || continue

    # mtime filter (MTIME_BUFFER is wider than CHECK_WINDOW to avoid misses).
    # GNU syntax is tried first: BSD stat rejects -c without printing anything,
    # whereas GNU stat treats -f as "file-system mode" and prints unrelated
    # text, which would corrupt the value if BSD syntax were tried first.
    mtime=$(stat -c '%Y' "$jsonl" 2>/dev/null) \
      || mtime=$(stat -f '%m' "$jsonl" 2>/dev/null) \
      || continue
    [[ "$mtime" =~ ^[0-9]+$ ]] || continue
    if (( now - mtime > MTIME_BUFFER )); then
      continue
    fi

    # Skip empty / near-empty files.
    fsize=$(wc -c < "$jsonl" | tr -d ' ')
    if [[ ! "$fsize" =~ ^[0-9]+$ ]] || (( fsize < 100 )); then
      continue
    fi

    # Cheap prefilter so Python only runs on files that can contain a hit.
    # It is deliberately a superset of the Python check below (no quotes
    # required around the number), so it never hides a record that the
    # authoritative check would have counted.
    LC_ALL=C grep -q -E '"(errorCode|errorMessage)".*(1305|429)' "$jsonl" 2>/dev/null \
      || continue

    # Authoritative check. The cutoff and the path are passed as argv, never
    # spliced into the program text, so unusual file names cannot break it.
    recent=$(python3 -c '
import json
import sys

cutoff, path = sys.argv[1], sys.argv[2]
hits = 0
# errors="replace": one undecodable byte must not abort the whole file.
with open(path, encoding="utf-8", errors="replace") as fh:
    for line in fh:
        try:
            rec = json.loads(line)
            if rec.get("timestamp", "")[:19] < cutoff:
                continue
            msg = rec.get("message") or {}
            if msg.get("stopReason") != "error":
                continue
            err = str(msg.get("errorMessage", ""))
            # str(): errorCode may be stored as a number instead of a string.
            code = str(msg.get("errorCode") or "")
            if "429" in err or "1305" in code:
                hits += 1
        except Exception:
            # Best effort: malformed or unexpectedly shaped lines are skipped.
            continue
print(hits)
' "$cutoff" "$jsonl" 2>/dev/null) || recent=0
    [[ "$recent" =~ ^[0-9]+$ ]] || recent=0

    total=$(( total + recent ))
  done

  echo "$total"
}
# === Main ===
# HEALTH_CMD / RESTART_CMD hold a command plus its arguments in one string and
# are expanded unquoted on purpose so that they word-split into argv.

# 1. Make sure the Gateway is running at all; if the health probe fails,
#    restart right away, reset the stored count and finish this run.
# shellcheck disable=SC2086
if ! $HEALTH_CMD &>/dev/null; then
  log "WARN: Gateway health check failed, attempting restart"
  # shellcheck disable=SC2086
  $RESTART_CMD 2>&1 || log "ERROR: Gateway restart command failed"
  echo 0 > "$STATE_FILE"
  exit 0
fi

# 2. Count recent 429 errors. A failed or non-numeric result is reported
#    instead of being silently treated as "no errors".
four29_count=$(count_recent_429) || four29_count=""
if [[ ! "$four29_count" =~ ^[0-9]+$ ]]; then
  log "ERROR: could not count 429 errors (got '${four29_count}'), skipping this run"
  exit 1
fi

# 3. Decide: >= THRESHOLD errors within CHECK_WINDOW -> restart.
#    CHECK_WINDOW is longer than the cron period, so errors that already
#    caused a restart are still inside the window on the next run. A cooldown
#    of CHECK_WINDOW seconds after a 429-triggered restart prevents restarting
#    again for the very same errors.
restart_stamp="${STATE_FILE}.last-restart"
if [ "$four29_count" -ge "$THRESHOLD" ]; then
  now=$(date +%s)
  last_restart=$(cat "$restart_stamp" 2>/dev/null || true)
  [[ "$last_restart" =~ ^[0-9]+$ ]] || last_restart=0
  if (( now - last_restart < CHECK_WINDOW )); then
    log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), restarted $(( now - last_restart ))s ago, cooldown active (no restart)"
  else
    log "ALERT: ${four29_count} 429 errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), restarting Gateway"
    # Run inside `if` so a failing restart is logged rather than aborting
    # the script under `set -e` before anything is recorded.
    # shellcheck disable=SC2086
    if $RESTART_CMD 2>&1; then
      log "Gateway restart completed"
    else
      log "ERROR: Gateway restart command failed"
    fi
    echo "$now" > "$restart_stamp"
  fi
  echo "$four29_count" > "$STATE_FILE"
elif [ "$four29_count" -gt 0 ]; then
  log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}, no restart)"
  echo "$four29_count" > "$STATE_FILE"
else
  # Only announce "cleared" when the previous run had recorded errors.
  if [ -f "$STATE_FILE" ] && [ "$(cat "$STATE_FILE")" != "0" ]; then
    log "429 cleared"
  fi
  echo 0 > "$STATE_FILE"
fi