auto-sync: 2026-05-28 13:15:19

This commit is contained in:
cfdaily
2026-05-28 13:15:19 +08:00
parent 8de712786f
commit 2d0279bc2c
+30 -31
View File
@@ -4,7 +4,7 @@
# 机制:
# 1. 检查最近 CHECK_WINDOW 秒内的 session jsonl 日志
# 2. 统计 429 错误次数(errorCode=1305 或 errorMessage 含 "429")
# 3. 连续 THRESHOLD 次检测都发现新 429 → 重启 Gateway
# 3. CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启 Gateway
#
# 部署:cron 每分钟执行一次
# * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1
@@ -14,9 +14,17 @@ set -euo pipefail
# PATH is incomplete when run from crontab, so prepend the usual binary directories.
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"
# === flock: prevent concurrent runs ===
# Open the lock file on fd 9 and attempt a non-blocking lock on it; when the
# lock cannot be taken, exit quietly with status 0.
# NOTE(review): the `if ! flock ...` branch is also taken when the flock command
# itself cannot be run (e.g. it is not installed), in which case the script
# exits 0 without checking anything — confirm flock is available on the host.
LOCK_FILE="/tmp/gateway-watchdog.lock"
exec 9>"$LOCK_FILE"
if ! flock -n 9; then
exit 0
fi
# === Configuration ===
CHECK_WINDOW=120 # how many seconds of recent logs to inspect
THRESHOLD=3 # restart after this many consecutive 429 detections
THRESHOLD=3 # restart when this many 429s are found within CHECK_WINDOW
MTIME_BUFFER=180 # file mtime cutoff (CHECK_WINDOW + buffer, to avoid missed detections)
# File that the decision logic writes the latest 429 count into.
STATE_FILE="/tmp/gateway-watchdog-429-count"
# Root directory under which */sessions/*.jsonl files are scanned.
LOG_DIR="/Users/chufeng/.openclaw/agents"
# Command executed when the threshold is reached.
RESTART_CMD="openclaw gateway restart"
@@ -33,20 +41,22 @@ count_recent_429() {
local cutoff=$(date -u -v-${CHECK_WINDOW}S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null || date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S')
local count=0
# 只看 session jsonl(排除 trajectory)
for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do
# 排除 trajectory 和 checkpoint 文件
[[ "$jsonl" == *"trajectory"* ]] && continue
[[ "$jsonl" == *"checkpoint"* ]] && continue
[ -f "$jsonl" ] || continue
# 只看最近修改的文件(性能优化)
# mtime 过滤(加 buffer 防漏检)
local mtime
mtime=$(stat -f '%m' "$jsonl" 2>/dev/null | tr -d ' \n' || stat -c '%Y' "$jsonl" 2>/dev/null | tr -d ' \n')
[ -z "$mtime" ] && continue
local now
now=$(date +%s)
local age=$(( now - mtime ))
# 文件超过 CHECK_WINDOW 秒没修改就跳过
[ "$age" -gt "$CHECK_WINDOW" ] && continue
# 文件小于 100 字节就跳过(空/刚创建)
[ "$age" -gt "$MTIME_BUFFER" ] && continue
# 跳过空文件
local fsize
fsize=$(wc -c < "$jsonl" | tr -d ' ')
[ "$fsize" -lt 100 ] && continue
@@ -55,7 +65,6 @@ count_recent_429() {
local found=0
found=$(grep -c '"errorCode".*"1305"\|"errorMessage".*"429"' "$jsonl" 2>/dev/null || true)
[ -z "$found" ] && found=0
# 进一步过滤:只统计时间窗口内的
if [ "$found" -gt 0 ]; then
local recent
recent=$(python3 -c "
@@ -72,7 +81,7 @@ with open('${jsonl}') as f:
msg = d.get('message', {})
if msg.get('stopReason') == 'error':
err = str(msg.get('errorMessage', ''))
code = str(msg.get('errorCode', ''))
code = msg.get('errorCode') or ''
if '429' in err or '1305' in code:
count += 1
except:
@@ -98,29 +107,19 @@ fi
# 2. Count recent 429 errors
four29_count=$(count_recent_429)
# 3. Read the consecutive-detection counter
consecutive=0
if [ -f "$STATE_FILE" ]; then
consecutive=$(cat "$STATE_FILE")
fi
# 4. Decide
if [ "$four29_count" -gt 0 ]; then
consecutive=$(( consecutive + 1 ))
log "429 detected: ${four29_count} recent errors, consecutive=${consecutive}/${THRESHOLD}"
else
if [ "$consecutive" -gt 0 ]; then
log "429 cleared (was ${consecutive} consecutive, now 0)"
fi
consecutive=0
fi
echo "$consecutive" > "$STATE_FILE"
# 5. Threshold reached → restart
if [ "$consecutive" -ge "$THRESHOLD" ]; then
log "ALERT: ${consecutive} consecutive 429 detections, restarting Gateway"
# 3. Decide: total number of 429s within CHECK_WINDOW >= THRESHOLD → restart
if [ "$four29_count" -ge "$THRESHOLD" ]; then
log "ALERT: ${four29_count} 429 errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), restarting Gateway"
# RESTART_CMD is expanded unquoted so that it word-splits into command + arguments.
# NOTE(review): with `set -e` active, a non-zero exit from the restart command
# would end the script before the "completed" log line and the state-file
# write below — confirm that is the intended behavior.
$RESTART_CMD 2>&1
log "Gateway restart completed"
echo "$four29_count" > "$STATE_FILE"
elif [ "$four29_count" -gt 0 ]; then
# Below threshold: log the count and record it, but do not restart.
log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}, no restart)"
echo "$four29_count" > "$STATE_FILE"
else
# Only log "cleared" when a previous, non-zero record exists
if [ -f "$STATE_FILE" ] && [ "$(cat "$STATE_FILE")" != "0" ]; then
log "429 cleared"
fi
echo 0 > "$STATE_FILE"
fi