Files
2026-06-02 22:12:14 +08:00

253 lines
7.8 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
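# gateway-watchdog.sh — detect anomalies in the Gateway process log and restart the Gateway automatically (v2)
#
# Changes (v2):
# - Data source switched from the session jsonl files to the Gateway process log
# - Three detection rules: R1 (FailoverError), R2 (stalled recovery=none), R3 (rate_limit/429)
# - Restart-storm protection: 5-minute cooldown after a restart
# - Restart reasons are recorded in /tmp/gateway-watchdog-restarts.log for statistics and analysis
#
# Deployment: run from cron once per minute
# * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1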
set -euo pipefail

# cron runs with a minimal PATH; prepend the usual Homebrew/system locations.
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

# === Single-instance lock (macOS ships no flock, so an atomic mkdir is used) ===
LOCK_FILE="/tmp/gateway-watchdog.lock"
LOCK_STALE_MINUTES=10   # a lock directory older than this is treated as abandoned
if ! mkdir "$LOCK_FILE" 2>/dev/null; then
  # A run that died without executing its EXIT trap (SIGKILL, power loss)
  # leaves the directory behind. Without this age check every later run would
  # exit silently and the watchdog would stay disabled until someone removed
  # the directory by hand.
  if [ -z "$(find "$LOCK_FILE" -maxdepth 0 -type d -mmin "+${LOCK_STALE_MINUTES}" 2>/dev/null)" ]; then
    exit 0   # lock is recent: another run is still in progress
  fi
  printf '[%s] WARN: reclaiming stale lock %s (older than %s minutes)\n' \
    "$(date '+%Y-%m-%d %H:%M:%S')" "$LOCK_FILE" "$LOCK_STALE_MINUTES"
  rmdir "$LOCK_FILE" 2>/dev/null || true
  # If another run reclaimed the lock first, let it do the work.
  mkdir "$LOCK_FILE" 2>/dev/null || exit 0
fi
trap 'rmdir "$LOCK_FILE" 2>/dev/null' EXIT

# === Configuration ===
CHECK_WINDOW=120   # how many seconds of recent log to inspect
R1_THRESHOLD=2     # FailoverError threshold
R2_THRESHOLD=3     # stalled recovery=none threshold
R3_THRESHOLD=2     # rate_limit/429 threshold
COOLDOWN=300       # cooldown after a restart, in seconds
LOG_FILE="/tmp/openclaw/openclaw-$(date '+%Y-%m-%d').log"   # Gateway process log (one file per local date)
STATE_FILE="/tmp/gateway-watchdog-state"                    # JSON state file
RESTART_LOG="/tmp/gateway-watchdog-restarts.log"            # restart reasons (append-only)
RESTART_CMD="openclaw gateway restart"
HEALTH_CMD="openclaw gateway health"
# === Functions ===

# Print one message line to stdout, prefixed with the local time as
# "[YYYY-MM-DD HH:MM:SS] ".
# Arguments: all arguments, joined into a single message.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Read one top-level field from the JSON state file and print it to stdout.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   the value as rendered by Python print() (so a JSON null comes
#            out as "None"); an empty line when the key is missing or the
#            file cannot be parsed; nothing when the state file does not exist.
state_get() {
  local key="$1"
  [ -f "$STATE_FILE" ] || return 0
  # The path and key travel as argv rather than being pasted into the Python
  # source, so quotes or backslashes in them cannot break or alter the program.
  python3 -c '
import json
import sys

path, key = sys.argv[1], sys.argv[2]
try:
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(data.get(key, ""))
except Exception:
    # Unreadable, malformed or non-object state is reported as "no value".
    print("")
' "$STATE_FILE" "$key" 2>/dev/null
}
# Escape a string so it can be placed between double quotes in JSON.
# Handles the double quote, the backslash and newline / carriage return / tab.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
_json_escape() {
  local raw="$1" escaped="" ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch="${raw:i:1}"
    case "$ch" in
      '"') escaped+='\"' ;;
      '\') escaped+='\\' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      *) escaped+="$ch" ;;
    esac
  done
  printf '%s' "$escaped"
}

# Overwrite the state file with a single-line JSON object.
# Globals:   STATE_FILE (written)
# Arguments: $1 - last restart time (string)
#            $2 - last restart reason (string)
#            $3 - cooldown deadline in epoch seconds
# Strings are JSON-escaped. A cooldown that is empty or not made of digits is
# stored as null instead of producing a file that no longer parses.
state_write() {
  local last_restart_time="$1"
  local last_restart_reason="$2"
  local cooldown_until="$3"
  local cooldown_json="null"
  case "$cooldown_until" in
    '' | *[!0-9]*) ;;                                # not a plain integer: keep null
    *) cooldown_json=$(( 10#$cooldown_until )) ;;    # force base 10, drop leading zeros
  esac
  printf '{"last_restart_time":"%s","last_restart_reason":"%s","cooldown_until":%s}\n' \
    "$(_json_escape "$last_restart_time")" \
    "$(_json_escape "$last_restart_reason")" \
    "$cooldown_json" > "$STATE_FILE"
}
# Append one JSON line describing a restart to the permanent restart log.
# Globals:   RESTART_LOG (read; the file is created if needed and appended to)
# Arguments: $1 - reason
#            $2 - detail text
#            $3, $4, $5 - R1 / R2 / R3 hit counts (integers)
# Returns:   0 on success; 1 (with a warning on stderr) if the entry could
#            not be written, e.g. a non-integer count or an unwritable file.
log_restart() {
  local reason="$1"
  local detail="$2"
  local r1_count="$3"
  local r2_count="$4"
  local r3_count="$5"
  local ts
  ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
  # All values are passed as argv; pasting them into the Python source would
  # break on quotes or backslashes in the detail text. Opening the file in
  # append mode creates it, so no separate touch is needed.
  if ! python3 -c '
import json
import sys

path, stamp, reason, detail = sys.argv[1:5]
r1, r2, r3 = (int(value) for value in sys.argv[5:8])
entry = {
    "time": stamp,
    "reason": reason,
    "detail": detail,
    "counts": {"r1": r1, "r2": r2, "r3": r3},
}
with open(path, "a", encoding="utf-8") as fh:
    fh.write(json.dumps(entry, ensure_ascii=False) + "\n")
' "$RESTART_LOG" "$ts" "$reason" "$detail" "$r1_count" "$r2_count" "$r3_count"; then
    printf 'WARN: could not append restart entry to %s\n' "$RESTART_LOG" >&2
    return 1
  fi
}
# Report whether the post-restart cooldown is still in effect.
# Reads the "cooldown_until" field (epoch seconds) through state_get.
# Returns: 0 while the current time is before cooldown_until;
#          1 otherwise, including when the stored value is missing, "None",
#          or anything else that is not a plain non-negative integer.
is_in_cooldown() {
  local cooldown_until now
  cooldown_until=$(state_get "cooldown_until")
  # Reject non-integers before the numeric test so a damaged state file does
  # not surface as an "integer expression expected" error from [ ... ].
  case "$cooldown_until" in
    '' | *[!0-9]*) return 1 ;;
  esac
  now=$(date +%s)
  if [ "$now" -lt "$cooldown_until" ]; then
    return 0   # still cooling down
  fi
  return 1     # cooldown is over
}
# Print the Gateway log lines whose "time" field lies within the last
# CHECK_WINDOW seconds (the caller greps and counts them).
# Globals:   LOG_FILE (read), CHECK_WINDOW (read)
# Outputs:   matching lines on stdout, stripped of surrounding whitespace;
#            nothing when the log file does not exist.
# Each non-empty line is parsed as JSON; lines that are not JSON objects or
# have no usable "time" are skipped. Timestamps with an explicit UTC offset
# (or a trailing Z) are compared as absolute instants. Timestamps without an
# offset, or ones that only parse down to their first 19 characters, are
# taken to be UTC+8.
# NOTE(review): only today's LOG_FILE is read, so just after midnight the
# window cannot reach into the previous day's file.
get_recent_lines() {
  [ -f "$LOG_FILE" ] || return 0
  # One Python process does both the cutoff computation and the filtering;
  # the path and window are passed as argv instead of being pasted into the
  # source text.
  python3 -c '
import json
import sys
from datetime import datetime, timedelta, timezone

path, window = sys.argv[1], int(sys.argv[2])
default_tz = timezone(timedelta(hours=8))
# Whole-second cutoff: a line counts when its second is at or after it.
cutoff = (datetime.now(default_tz) - timedelta(seconds=window)).replace(microsecond=0)


def parse_time(value):
    text = str(value).strip()
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    try:
        stamp = datetime.fromisoformat(text)
    except (ValueError, AttributeError):
        # Fall back to the leading YYYY-MM-DDTHH:MM:SS part only.
        stamp = datetime.strptime(text[:19], "%Y-%m-%dT%H:%M:%S")
    if stamp.tzinfo is None:
        stamp = stamp.replace(tzinfo=default_tz)
    return stamp


with open(path, encoding="utf-8", errors="replace") as fh:
    for line in fh:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
            if parse_time(record.get("time", "")) >= cutoff:
                sys.stdout.write(line + "\n")
        except Exception:
            # Best effort: the log may contain lines that are not JSON
            # objects or that carry no parseable time; skip those.
            continue
' "$LOG_FILE" "$CHECK_WINDOW"
}
# Helper: count the stdin lines matching a grep pattern and always print a
# bare integer.
# Arguments: $1 - grep pattern
# Outputs:   the number of matching lines; 0 when nothing matches or grep
#            produces no output.
_safe_count() {
  local hits
  hits=$(grep -c "$1" 2>/dev/null || true)
  # Drop every whitespace character so only the digits remain.
  hits=${hits//[[:space:]]/}
  printf '%s\n' "${hits:-0}"
}
# Count how many recent log lines hit each detection rule.
# Globals:   R1_COUNT, R2_COUNT, R3_COUNT (written; always plain integers)
# Uses get_recent_lines for the input and _safe_count for R1/R2.
#   R1: lines containing "lane task error" and "FailoverError"
#   R2: lines containing "stalled session" and "recovery=none"
#   R3: lines containing "rate_limit", or the number 429 standing alone
count_rules() {
  local recent_lines
  recent_lines=$(get_recent_lines)

  R1_COUNT=0
  R2_COUNT=0
  R3_COUNT=0
  [ -n "$recent_lines" ] || return 0

  # "|| true" keeps a no-match grep (exit status 1) from failing the pipeline;
  # _safe_count then reports 0 for the empty input.
  R1_COUNT=$(printf '%s\n' "$recent_lines" \
    | { grep -F -- "lane task error" || true; } \
    | _safe_count "FailoverError")

  R2_COUNT=$(printf '%s\n' "$recent_lines" \
    | { grep -F -- "stalled session" || true; } \
    | _safe_count "recovery=none")

  # 429 must not be preceded by a digit or a dot, nor followed by a digit.
  # A bare substring match would also count timestamp milliseconds such as
  # "14.429+08:00" and numbers such as "1429", leading to spurious restarts.
  local r3_pattern='rate_limit|(^|[^0-9.])429([^0-9]|$)'
  R3_COUNT=$(printf '%s\n' "$recent_lines" | grep -Ec -- "$r3_pattern" || true)
  R3_COUNT=${R3_COUNT:-0}
}
# === Main flow ===
log "watchdog v2 check start (log: $LOG_FILE)"

# 1. Health check first. It runs before the log-file check because a Gateway
#    that is down writes no log; if a missing log file ended the run, a dead
#    Gateway would never be restarted after the daily log file name changes.
#    HEALTH_CMD and RESTART_CMD hold whole command lines in one string, so
#    they are expanded unquoted on purpose to be split into words.
# NOTE(review): this path restarts without consulting the cooldown, as before;
# confirm that a Gateway failing its health check every minute should indeed
# be restarted every minute.
# shellcheck disable=SC2086
if ! $HEALTH_CMD &>/dev/null; then
  log "WARN: Gateway health check failed, attempting restart"
  # shellcheck disable=SC2086
  $RESTART_CMD 2>&1 || true
  restart_ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
  cooldown_deadline=$(( $(date +%s) + COOLDOWN ))
  state_write "$restart_ts" "health_fail" "$cooldown_deadline"
  log_restart "health_fail" "Gateway health check failed" 0 0 0 \
    || log "WARN: could not record restart in ${RESTART_LOG}"
  log "Gateway restart attempted (reason: health_fail)"
  exit 0
fi

# 2. Rule-based detection needs today's Gateway log.
if [ ! -f "$LOG_FILE" ]; then
  log "INFO: Gateway log file not found ($LOG_FILE), skipping detection"
  exit 0
fi

# 3. Count the hits for each rule.
count_rules
log "rule counts: R1(FailoverError)=${R1_COUNT}/${R1_THRESHOLD} R2(stalled)=${R2_COUNT}/${R2_THRESHOLD} R3(rate_limit)=${R3_COUNT}/${R3_THRESHOLD}"

# 4. Pick the first rule (R1, then R2, then R3) whose threshold is reached.
triggered_reason=""
triggered_detail=""
if [ "$R1_COUNT" -ge "$R1_THRESHOLD" ]; then
  triggered_reason="R1"
  triggered_detail="FailoverError x${R1_COUNT}"
elif [ "$R2_COUNT" -ge "$R2_THRESHOLD" ]; then
  triggered_reason="R2"
  triggered_detail="stalled recovery=none x${R2_COUNT}"
elif [ "$R3_COUNT" -ge "$R3_THRESHOLD" ]; then
  triggered_reason="R3"
  triggered_detail="rate_limit/429 x${R3_COUNT}"
fi

if [ -z "$triggered_reason" ]; then
  # No rule fired.
  log "all clear"
elif is_in_cooldown; then
  cooldown_until_now=$(state_get "cooldown_until")
  log "ALERT: ${triggered_reason} triggered (${triggered_detail}) but in cooldown until ${cooldown_until_now}, skipping restart"
else
  log "ALERT: ${triggered_reason} triggered (${triggered_detail}), restarting Gateway"
  # Capture the status instead of letting set -e abort here: the cooldown
  # must be written even when the restart command fails, otherwise the same
  # rule fires again on every run and the restart is retried each minute.
  restart_rc=0
  # shellcheck disable=SC2086
  $RESTART_CMD 2>&1 || restart_rc=$?
  restart_ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
  cooldown_deadline=$(( $(date +%s) + COOLDOWN ))
  state_write "$restart_ts" "$triggered_reason" "$cooldown_deadline"
  log_restart "$triggered_reason" "$triggered_detail" "$R1_COUNT" "$R2_COUNT" "$R3_COUNT" \
    || log "WARN: could not record restart in ${RESTART_LOG}"
  if [ "$restart_rc" -eq 0 ]; then
    log "Gateway restart completed (reason: ${triggered_reason})"
  else
    log "ERROR: Gateway restart command exited with status ${restart_rc} (reason: ${triggered_reason}); cooldown applied"
  fi
fi