auto-sync: 2026-06-02 22:00:32

This commit is contained in:
cfdaily
2026-06-02 22:00:32 +08:00
parent 4d2d13e0c1
commit 96c3f1409b
+182 -79
View File
@@ -1,10 +1,11 @@
#!/bin/bash
# gateway-watchdog.sh — 检测 OpenClaw Gateway 429 并自动重启
#
# 机制
# 1. 检查最近 CHECK_WINDOW 秒内的 session jsonl 日志
# 2. 统计 429 错误次数(errorCode=1305 或 errorMessage 含 "429"
# 3. CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启 Gateway
# gateway-watchdog.sh — 检测 Gateway 进程日志异常并自动重启(v2)
#
# 改动(v2):
# - 数据源从 session jsonl 改为 Gateway 进程日志
# - 三条检测规则:R1(FailoverError)、R2(stalled recovery=none)、R3(rate_limit/429)
# - 防重启风暴:重启后 5 分钟冷却期
# - 重启原因记录到 /tmp/gateway-watchdog-restarts.log 用于统计分析
#
# 部署:cron 每分钟执行一次
# * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1
@@ -14,20 +15,22 @@ set -euo pipefail
# crontab 环境下 PATH 不完整,补上
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"
# === flock 防并发 ===
# === flock 防并发macOS 无 flock,用 mkdir ===
LOCK_FILE="/tmp/gateway-watchdog.lock"
# macOS 无 flock,用 mkdir 做简易锁
if ! mkdir "$LOCK_FILE" 2>/dev/null; then
exit 0
fi
trap 'rmdir "$LOCK_FILE" 2>/dev/null' EXIT
# === 配置 ===
CHECK_WINDOW=120 # 检查最近多少秒的日志
THRESHOLD=3 # CHECK_WINDOW 内发现多少个 429 就重启
MTIME_BUFFER=180 # 文件 mtime 阈值(CHECK_WINDOW + buffer,防漏检)
STATE_FILE="/tmp/gateway-watchdog-429-count"
LOG_DIR="/Users/chufeng/.openclaw/agents"
# --- Detection window and per-rule thresholds ---
# Only log lines from the last CHECK_WINDOW seconds are inspected.
CHECK_WINDOW=120
# Hits required inside the window before a rule triggers a restart.
R1_THRESHOLD=2   # R1: FailoverError
R2_THRESHOLD=3   # R2: stalled session with recovery=none
R3_THRESHOLD=2   # R3: rate_limit / 429

# --- Restart throttling ---
# Seconds after a restart during which rule-triggered restarts are skipped.
COOLDOWN=300

# --- Files ---
# Gateway process log; the file name carries the current local date.
LOG_FILE="/tmp/openclaw/openclaw-$(date '+%Y-%m-%d').log"
# JSON state file (last restart time/reason and cooldown deadline).
STATE_FILE="/tmp/gateway-watchdog-state"
# Append-only record of every restart and its reason.
RESTART_LOG="/tmp/gateway-watchdog-restarts.log"

# --- Gateway commands ---
HEALTH_CMD="openclaw gateway health"
RESTART_CMD="openclaw gateway restart"
@@ -37,90 +40,190 @@ log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}
# 统计最近的 429 错误数
count_recent_429() {
local cutoff=$(date -u -v-${CHECK_WINDOW}S '+%Y-%m-%dT%H:%M:%S' 2>/dev/null || date -u -d "${CHECK_WINDOW} seconds ago" '+%Y-%m-%dT%H:%M:%S')
local count=0
#######################################
# Print one top-level field of the JSON state file.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   the value on stdout; an empty line when the key is missing or
#            the file cannot be parsed; nothing when the file does not exist.
#            A JSON null is printed as "None" (callers check for that).
# Returns:   always 0, so a broken state file never aborts a `set -e` caller
#######################################
state_get() {
  local key="${1:-}"
  [ -f "$STATE_FILE" ] || return 0
  # Path and key travel as argv instead of being pasted into the Python
  # source, so quotes or backslashes in either cannot break or inject code.
  python3 - "$STATE_FILE" "$key" 2>/dev/null <<'PY' || true
import json
import sys

path, key = sys.argv[1], sys.argv[2]
try:
    with open(path) as f:
        data = json.load(f)
    print(data.get(key, ""))
except Exception:
    # Unreadable, malformed or non-object state: report "no value".
    print("")
PY
}
for jsonl in "$LOG_DIR"/*/sessions/*.jsonl; do
# 排除 trajectory 和 checkpoint 文件
[[ "$jsonl" == *"trajectory"* ]] && continue
[[ "$jsonl" == *"checkpoint"* ]] && continue
[ -f "$jsonl" ] || continue
# Escape a string for use inside a JSON double-quoted value
# (backslash, double quote, newline, tab, carriage return).
_state_json_escape() {
  local s="${1:-}"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\t'/\\t}
  s=${s//$'\r'/\\r}
  printf '%s' "$s"
}

#######################################
# Replace the state file with a single-line JSON object.
# Globals:   STATE_FILE (written)
# Arguments: $1 - last restart time (string)
#            $2 - last restart reason (string)
#            $3 - cooldown deadline as epoch seconds; anything that is not a
#                 plain non-negative integer is stored as 0 so the file is
#                 always valid JSON
# Returns:   non-zero if the file cannot be written
#######################################
state_write() {
  local last_restart_time="${1:-}"
  local last_restart_reason="${2:-}"
  local cooldown_until="${3:-}"
  local tmp_file

  case "$cooldown_until" in
    '' | *[!0-9]*) cooldown_until=0 ;;
  esac

  # Write to a sibling temp file and rename, so a reader never sees a
  # half-written state file.
  tmp_file="${STATE_FILE}.$$.tmp"
  printf '{"last_restart_time":"%s","last_restart_reason":"%s","cooldown_until":%s}\n' \
    "$(_state_json_escape "$last_restart_time")" \
    "$(_state_json_escape "$last_restart_reason")" \
    "$cooldown_until" > "$tmp_file"
  mv -f -- "$tmp_file" "$STATE_FILE"
}
# mtime 过滤(加 buffer 防漏检)
local mtime
mtime=$(stat -f '%m' "$jsonl" 2>/dev/null | tr -d ' \n' || stat -c '%Y' "$jsonl" 2>/dev/null | tr -d ' \n')
[ -z "$mtime" ] && continue
#######################################
# Append one JSON line describing a restart to the permanent restart log.
# Globals:   RESTART_LOG (appended to; created if missing)
# Arguments: $1 - reason code (e.g. R1, health_fail)
#            $2 - free-text detail
#            $3 $4 $5 - R1 / R2 / R3 hit counts (non-integers are stored as 0)
# Outputs:   a warning on stderr if the entry could not be written
# Returns:   always 0; failing to record a restart must not abort the caller
#######################################
log_restart() {
  local reason="${1:-}"
  local detail="${2:-}"
  local r1_count="${3:-0}"
  local r2_count="${4:-0}"
  local r3_count="${5:-0}"
  local ts
  ts=$(date '+%Y-%m-%dT%H:%M:%S%z')

  # Every value is handed over as argv rather than interpolated into the
  # Python source, so quotes/backslashes in reason or detail are stored
  # verbatim instead of breaking (or being interpreted by) the snippet.
  if ! python3 - "$RESTART_LOG" "$ts" "$reason" "$detail" \
    "$r1_count" "$r2_count" "$r3_count" 2>/dev/null <<'PY'
import json
import sys

path, ts, reason, detail = sys.argv[1:5]


def to_int(text):
    try:
        return int(text)
    except ValueError:
        return 0


r1, r2, r3 = (to_int(v) for v in sys.argv[5:8])
entry = {
    "time": ts,
    "reason": reason,
    "detail": detail,
    "counts": {"r1": r1, "r2": r2, "r3": r3},
}
# surrogateescape round-trips argv bytes that were not valid in the locale.
with open(path, "a", encoding="utf-8", errors="surrogateescape") as f:
    f.write(json.dumps(entry, ensure_ascii=False) + "\n")
PY
  then
    printf 'WARN: could not append restart record to %s\n' "$RESTART_LOG" >&2
  fi
  return 0
}
#######################################
# Tell whether the post-restart cooldown is still active.
# Reads the "cooldown_until" field (epoch seconds) through state_get and
# compares it with the current time.
# Returns:   0 while now < cooldown_until;
#            1 otherwise, including when the field is missing, empty,
#            "None", or not a plain non-negative integer
#######################################
is_in_cooldown() {
  local cooldown_until now
  cooldown_until=$(state_get "cooldown_until") || cooldown_until=""

  # Reject anything that is not all digits before it reaches the numeric
  # test; this also covers the empty string and the literal "None".
  case "$cooldown_until" in
    '' | *[!0-9]*) return 1 ;;
  esac

  now=$(date +%s)
  # The test's own status is the answer: 0 = in cooldown, 1 = not.
  [ "$now" -lt "$cooldown_until" ]
}
# 跳过空文件
local fsize
fsize=$(wc -c < "$jsonl" | tr -d ' ')
[ "$fsize" -lt 100 ] && continue
#######################################
# Print the Gateway log lines whose "time" field falls inside the last
# CHECK_WINDOW seconds, one JSON line per output line, for later grep
# counting.
# Globals:   LOG_FILE (read), CHECK_WINDOW (read)
# Outputs:   matching lines on stdout; nothing when the log file is missing
# Returns:   always 0; an unreadable log simply yields no lines instead of
#            aborting a `set -e` caller
# NOTE(review): the cutoff is computed in a fixed UTC+8 offset and compared
# as a string against the first 19 characters of "time"; this presumes the
# Gateway writes local ISO timestamps in UTC+8 - confirm against real logs.
#######################################
get_recent_lines() {
  [ -f "$LOG_FILE" ] || return 0

  # One Python process computes the cutoff and filters the file. The path and
  # window are passed as argv so they are never spliced into Python source.
  python3 - "$LOG_FILE" "$CHECK_WINDOW" 2>/dev/null <<'PY' || return 0
import json
import sys
from datetime import datetime, timedelta, timezone

path, window = sys.argv[1], int(sys.argv[2])
tz = timezone(timedelta(hours=8))
cutoff = (datetime.now(tz) - timedelta(seconds=window)).strftime("%Y-%m-%dT%H:%M:%S")

# Binary mode: a stray non-UTF-8 byte only costs that one line.
with open(path, "rb") as f:
    for raw in f:
        raw = raw.strip()
        if not raw:
            continue
        try:
            ts = json.loads(raw).get("time", "")
        except (ValueError, AttributeError):
            # Not JSON, or JSON that is not an object: ignore the line.
            continue
        # Truncate to whole seconds before comparing with the cutoff.
        if isinstance(ts, str) and ts[:19] >= cutoff:
            sys.stdout.buffer.write(raw + b"\n")
PY
}
#######################################
# Count how many recent log lines hit each detection rule.
# Results are returned through globals:
#   R1_COUNT - lines containing both "lane task error" and "FailoverError"
#   R2_COUNT - lines containing both "stalled session" and "recovery=none"
#   R3_COUNT - lines containing "rate_limit" or the quoted token "429"
# Each global is always a single plain integer.
# Returns:   always 0
#######################################
count_rules() {
  local recent_lines
  recent_lines=$(get_recent_lines) || recent_lines=""

  R1_COUNT=0
  R2_COUNT=0
  R3_COUNT=0
  [ -n "$recent_lines" ] || return 0

  # grep -c prints "0" AND exits 1 when nothing matches. Under `pipefail` a
  # trailing `|| echo 0` would therefore append a second "0" and produce a
  # two-line value; `|| true` keeps grep's own single number.
  # -F: the needles are literal strings, not regular expressions.
  R1_COUNT=$(printf '%s\n' "$recent_lines" \
    | grep -F -- "lane task error" \
    | grep -cF -- "FailoverError" || true)

  R2_COUNT=$(printf '%s\n' "$recent_lines" \
    | grep -F -- "stalled session" \
    | grep -cF -- "recovery=none" || true)

  # Two -e patterns replace the non-portable BRE alternation `\|`.
  R3_COUNT=$(printf '%s\n' "$recent_lines" \
    | grep -cF -e 'rate_limit' -e '"429"' || true)

  # Defensive default in case grep itself could not run.
  R1_COUNT=${R1_COUNT:-0}
  R2_COUNT=${R2_COUNT:-0}
  R3_COUNT=${R3_COUNT:-0}
  return 0
}
# === 主逻辑 ===
# 1. 先检查 Gateway 是否在运行
if ! $HEALTH_CMD &>/dev/null; then
log "WARN: Gateway health check failed, attempting restart"
$RESTART_CMD 2>&1 || true
echo 0 > "$STATE_FILE"
# Announce the run and which Gateway log file will be inspected.
log "watchdog v2 check start (log: $LOG_FILE)"

# Step 1: without today's Gateway log there is nothing to analyse, so the
# run ends here successfully.
[ -f "$LOG_FILE" ] || {
  log "INFO: Gateway log file not found ($LOG_FILE), skipping detection"
  exit 0
}
# 2. 统计最近 429 错误
four29_count=$(count_recent_429)
# 3. 判断:CHECK_WINDOW 内 429 总数 >= THRESHOLD → 重启
if [ "$four29_count" -ge "$THRESHOLD" ]; then
log "ALERT: ${four29_count} 429 errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}), restarting Gateway"
$RESTART_CMD 2>&1
log "Gateway restart completed"
echo "$four29_count" > "$STATE_FILE"
elif [ "$four29_count" -gt 0 ]; then
log "429 detected: ${four29_count} errors in last ${CHECK_WINDOW}s (threshold=${THRESHOLD}, no restart)"
echo "$four29_count" > "$STATE_FILE"
else
# 只在有前次记录时打印 clear
if [ -f "$STATE_FILE" ] && [ "$(cat "$STATE_FILE")" != "0" ]; then
log "429 cleared"
fi
echo 0 > "$STATE_FILE"
#######################################
# Restart the Gateway and record the attempt.
# The cooldown and the restart record are written whether or not the restart
# command succeeds: under `set -e` a bare failing command would end the
# script before any bookkeeping, and the next cron run would retry at once
# with no trace of the attempt.
# Globals:   RESTART_CMD, COOLDOWN (read)
# Arguments: $1 - reason code, $2 - detail text, $3 $4 $5 - R1/R2/R3 counts
#######################################
perform_restart() {
  local reason="$1" detail="$2" r1="$3" r2="$4" r3="$5"
  local rc=0 restarted_at cooldown_deadline

  # Intentionally unquoted: RESTART_CMD holds a command plus its arguments.
  $RESTART_CMD 2>&1 || rc=$?

  restarted_at=$(date '+%Y-%m-%dT%H:%M:%S%z')
  cooldown_deadline=$(( $(date +%s) + COOLDOWN ))
  state_write "$restarted_at" "$reason" "$cooldown_deadline"
  log_restart "$reason" "$detail" "$r1" "$r2" "$r3"

  if [ "$rc" -eq 0 ]; then
    log "Gateway restart completed (reason: ${reason})"
  else
    log "ERROR: Gateway restart command failed (exit ${rc}, reason: ${reason})"
  fi
}

# 2. Health check first: if the Gateway does not answer, restart right away
#    and finish this run.
# NOTE(review): this path does not consult the cooldown, so a Gateway that is
# still starting up after a restart can be restarted again a minute later -
# confirm that this is intended.
if ! $HEALTH_CMD &>/dev/null; then
  log "WARN: Gateway health check failed, attempting restart"
  perform_restart "health_fail" "Gateway health check failed" 0 0 0
  exit 0
fi

# 3. Count hits for every rule inside the check window.
count_rules
log "rule counts: R1(FailoverError)=${R1_COUNT}/${R1_THRESHOLD} R2(stalled)=${R2_COUNT}/${R2_THRESHOLD} R3(rate_limit)=${R3_COUNT}/${R3_THRESHOLD}"

# 4. Pick the first rule (R1, then R2, then R3) whose count reached its
#    threshold.
triggered_reason=""
triggered_detail=""
if [ "$R1_COUNT" -ge "$R1_THRESHOLD" ]; then
  triggered_reason="R1"
  triggered_detail="FailoverError x${R1_COUNT}"
elif [ "$R2_COUNT" -ge "$R2_THRESHOLD" ]; then
  triggered_reason="R2"
  triggered_detail="stalled recovery=none x${R2_COUNT}"
elif [ "$R3_COUNT" -ge "$R3_THRESHOLD" ]; then
  triggered_reason="R3"
  triggered_detail="rate_limit/429 x${R3_COUNT}"
fi

if [ -n "$triggered_reason" ]; then
  if is_in_cooldown; then
    # A restart happened recently; only report, to avoid a restart storm.
    local_cooldown_until=$(state_get "cooldown_until") || local_cooldown_until=""
    log "ALERT: ${triggered_reason} triggered (${triggered_detail}) but in cooldown until ${local_cooldown_until}, skipping restart"
  else
    log "ALERT: ${triggered_reason} triggered (${triggered_detail}), restarting Gateway"
    perform_restart "$triggered_reason" "$triggered_detail" "$R1_COUNT" "$R2_COUNT" "$R3_COUNT"
  fi
else
  # No rule reached its threshold.
  log "all clear"
fi