#!/bin/bash
# gateway-watchdog.sh — 检测 Gateway 进程日志异常并自动重启（v2）
#
# 改动（v2）：
#   - 数据源从 session jsonl 改为 Gateway 进程日志
#   - 三条检测规则：R1（FailoverError）、R2（stalled recovery=none）、R3（rate_limit/429）
#   - 防重启风暴：重启后 5 分钟冷却期
#   - 重启原因记录到 /tmp/gateway-watchdog-restarts.log 用于统计分析
#
# 部署：cron 每分钟执行一次
#   * * * * * /Users/chufeng/.openclaw/sanguo_projects/sanguo_moziplus_v2/scripts/gateway-watchdog.sh >> /tmp/gateway-watchdog.log 2>&1

# Strict mode: abort on a failing command (errexit), on use of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# cron supplies only a minimal PATH; put the usual tool directories in front.
PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:$PATH"
export PATH

# === Concurrency guard ===
# macOS ships no flock(1); mkdir is atomic, so a directory serves as the lock.
LOCK_FILE="/tmp/gateway-watchdog.lock"
LOCK_STALE_MINUTES=10    # a lock older than this is treated as abandoned
if ! mkdir "$LOCK_FILE" 2>/dev/null; then
    # The EXIT trap below never fires if a run is killed with SIGKILL or the
    # machine loses power; without reclaiming, the leftover directory would
    # make every later cron tick exit silently and disable the watchdog.
    # find prints the path only when its mtime is older than the threshold.
    if [ -n "$(find "$LOCK_FILE" -maxdepth 0 -type d -mmin "+${LOCK_STALE_MINUTES}" 2>/dev/null)" ]; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] WARN: removing stale lock $LOCK_FILE (older than ${LOCK_STALE_MINUTES} min)"
        rmdir "$LOCK_FILE" 2>/dev/null || true
        # Another instance may have won the race for the freed lock.
        mkdir "$LOCK_FILE" 2>/dev/null || exit 0
    else
        # A recent lock means another instance is still running.
        exit 0
    fi
fi
# Release the lock on every exit path that bash can intercept.
trap 'rmdir "$LOCK_FILE" 2>/dev/null' EXIT

# === Configuration ===

# How many seconds of recent log output each run inspects.
CHECK_WINDOW=120

# Hit counts at which each rule triggers a restart:
#   R1 = FailoverError, R2 = stalled session with recovery=none,
#   R3 = rate_limit / 429.
R1_THRESHOLD=2
R2_THRESHOLD=3
R3_THRESHOLD=2

# Seconds after a restart during which no further restart is performed.
COOLDOWN=300

# Gateway process log; the file name carries the current date.
LOG_FILE="/tmp/openclaw/openclaw-$(date '+%Y-%m-%d').log"

# JSON state file holding the last restart and the cooldown deadline.
STATE_FILE="/tmp/gateway-watchdog-state"

# Append-only record of restart reasons, kept for later analysis.
RESTART_LOG="/tmp/gateway-watchdog-restarts.log"

# Commands used to restart the Gateway and to probe its health.
RESTART_CMD="openclaw gateway restart"
HEALTH_CMD="openclaw gateway health"

# === 函数 ===

# Print a message to stdout prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
# Arguments: all arguments are joined with spaces into one message.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$*"
}

# Read one top-level field from the JSON state file.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   the value on stdout; an empty line when the key is missing, the
#            value is JSON null, or the file cannot be parsed; nothing at all
#            when the state file does not exist.
state_get() {
    local key="$1"
    if [ -f "$STATE_FILE" ]; then
        # The path and key travel as argv instead of being spliced into the
        # Python source, so quotes or backslashes in them cannot break (or
        # inject into) the program.
        python3 -c '
import json, sys

path, key = sys.argv[1], sys.argv[2]
try:
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    value = data.get(key, "") if isinstance(data, dict) else ""
except (OSError, ValueError):
    # Unreadable or corrupt state is treated as "no state".
    value = ""
# JSON null is reported as empty rather than the Python repr "None".
print("" if value is None else value)
' "$STATE_FILE" "$key" 2>/dev/null
    fi
}

# Write the JSON state file.
# Globals:   STATE_FILE (written)
# Arguments: $1 - timestamp of the last restart
#            $2 - reason tag of the last restart
#            $3 - cooldown deadline as integer epoch seconds
# Returns:   0 on success; non-zero (leaving any existing state file
#            untouched) when $3 is not an integer or the write fails.
state_write() {
    local last_restart_time="$1"
    local last_restart_reason="$2"
    local cooldown_until="$3"
    local tmp_file
    # Write to a sibling temp file and rename it into place so a reader never
    # sees a half-written state file.
    tmp_file=$(mktemp "${STATE_FILE}.XXXXXX") || return 1
    # json.dumps does the escaping; a quote or backslash in the reason would
    # otherwise produce invalid JSON. Compact separators keep the on-disk
    # format identical to the previous hand-written one.
    if python3 -c '
import json, sys

ts, reason, cooldown = sys.argv[1:4]
state = {
    "last_restart_time": ts,
    "last_restart_reason": reason,
    "cooldown_until": int(cooldown),
}
print(json.dumps(state, separators=(",", ":")))
' "$last_restart_time" "$last_restart_reason" "$cooldown_until" > "$tmp_file"; then
        mv -f "$tmp_file" "$STATE_FILE"
    else
        rm -f "$tmp_file"
        return 1
    fi
}

# Append one JSON line describing a restart to the permanent restart log.
# Globals:   RESTART_LOG (appended to)
# Arguments: $1 - reason tag
#            $2 - human-readable detail
#            $3, $4, $5 - integer hit counts for rules R1, R2, R3
# Returns:   non-zero when a count is not an integer or the log cannot be
#            written; the Python error is left on stderr so it is visible.
log_restart() {
    local reason="$1"
    local detail="$2"
    local r1_count="$3"
    local r2_count="$4"
    local r3_count="$5"
    local ts
    ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
    # Make sure the restart log exists before appending.
    touch "$RESTART_LOG"
    # All values are passed as argv rather than interpolated into the Python
    # source, so quotes or backslashes in the detail cannot corrupt the record.
    python3 -c '
import json, sys

log_path, ts, reason, detail = sys.argv[1:5]
r1, r2, r3 = (int(v) for v in sys.argv[5:8])
entry = {
    "time": ts,
    "reason": reason,
    "detail": detail,
    "counts": {"r1": r1, "r2": r2, "r3": r3},
}
with open(log_path, "a", encoding="utf-8") as f:
    f.write(json.dumps(entry, ensure_ascii=False) + "\n")
' "$RESTART_LOG" "$ts" "$reason" "$detail" "$r1_count" "$r2_count" "$r3_count"
}

# Report whether the post-restart cooldown is still active.
# Reads "cooldown_until" (epoch seconds) through state_get.
# Returns: 0 while the current time is before the deadline, 1 otherwise
#          (including when no deadline is recorded).
is_in_cooldown() {
    local deadline
    deadline=$(state_get "cooldown_until")

    # No recorded deadline ("" or the literal "None") means no cooldown.
    if [ -z "$deadline" ] || [ "$deadline" = "None" ]; then
        return 1
    fi

    local current_epoch
    current_epoch=$(date +%s)
    [ "$current_epoch" -lt "$deadline" ] && return 0
    return 1
}

# Print the Gateway log lines whose "time" field falls within the last
# CHECK_WINDOW seconds.
# Globals:   LOG_FILE, CHECK_WINDOW (read)
# Outputs:   matching log lines on stdout, unchanged apart from trimmed
#            surrounding whitespace; nothing when the log file is missing.
# Lines that are not JSON objects, or whose "time" is not a string, are
# skipped.
# NOTE(review): LOG_FILE is a per-day file, so right after midnight the tail of
# the previous day is not inspected.
get_recent_lines() {
    if [ ! -f "$LOG_FILE" ]; then
        return
    fi
    # One Python process computes the cutoff and filters the file. Inputs are
    # passed as argv instead of being interpolated into the source.
    python3 -c '
import json, sys
from datetime import datetime, timedelta, timezone

log_path, window = sys.argv[1], int(sys.argv[2])

# The first 19 characters of each "time" value are compared as text against
# a cutoff rendered in UTC+8.
# NOTE(review): this assumes the Gateway writes ISO timestamps in UTC+8;
# confirm against the real log format.
tz = timezone(timedelta(hours=8))
cutoff = (datetime.now(tz) - timedelta(seconds=window)).strftime("%Y-%m-%dT%H:%M:%S")

out = sys.stdout.buffer
# Read bytes and decode leniently: in text mode a single invalid UTF-8 byte
# raises while iterating the file and aborts the whole scan.
with open(log_path, "rb") as f:
    for raw in f:
        raw = raw.strip()
        if not raw:
            continue
        try:
            record = json.loads(raw.decode("utf-8", errors="replace"))
        except ValueError:
            continue
        if not isinstance(record, dict):
            continue
        ts = record.get("time", "")
        if isinstance(ts, str) and ts[:19] >= cutoff:
            out.write(raw + b"\n")
' "$LOG_FILE" "$CHECK_WINDOW"
}

# Count the stdin lines that match a grep pattern and print a bare integer.
# Arguments: $1 - grep (basic regex) pattern
# Outputs:   the match count on stdout; 0 when nothing matches, stdin is
#            empty, or grep itself fails.
_safe_count() {
    local count
    # "--" ends option parsing so a pattern starting with "-" is not taken as
    # a grep flag. grep -c prints 0 but exits 1 on no match; "|| true" keeps
    # that from tripping set -e / pipefail.
    count=$(grep -c -- "$1" 2>/dev/null || true)
    # Strip any whitespace so callers can use the value in integer tests.
    count=${count//[[:space:]]/}
    printf '%s\n' "${count:-0}"
}

# Count how many recent log lines hit each detection rule.
# Globals:   R1_COUNT, R2_COUNT, R3_COUNT (written; always set to integers)
# Calls:     get_recent_lines for the input, _safe_count for R1/R2 counting.
#   R1: line contains "lane task error" and "FailoverError"
#   R2: line contains "stalled session" and "recovery=none"
#   R3: line contains "rate_limit" or a standalone number 429
count_rules() {
    local recent_lines
    recent_lines=$(get_recent_lines)

    R1_COUNT=0
    R2_COUNT=0
    R3_COUNT=0
    [ -n "$recent_lines" ] || return 0

    # In the two-stage pipelines the first grep exits 1 when nothing matches;
    # "|| true" stops pipefail from turning that into a script abort.
    R1_COUNT=$(printf '%s\n' "$recent_lines" \
        | { grep -- "lane task error" || true; } \
        | _safe_count "FailoverError")

    R2_COUNT=$(printf '%s\n' "$recent_lines" \
        | { grep -- "stalled session" || true; } \
        | _safe_count "recovery=none")

    # 429 must stand alone as a number: not preceded by a digit or "." and not
    # followed by a digit. A bare substring match also counts millisecond
    # fields such as "12:00:00.429" and longer numbers such as "1429", which
    # could trigger a restart with no rate limiting at all.
    local r3_pattern='rate_limit|(^|[^0-9.])429([^0-9]|$)'
    # grep -c prints 0 and exits 1 when there is no match, hence "|| true".
    R3_COUNT=$(printf '%s\n' "$recent_lines" | grep -cE -- "$r3_pattern" || true)
}

# === Main logic ===

# Restart the Gateway and record the attempt.
# Globals:   RESTART_CMD, COOLDOWN, STATE_FILE, RESTART_LOG (read)
# Arguments: $1 - reason tag, $2 - human-readable detail,
#            $3, $4, $5 - R1/R2/R3 hit counts
# Returns:   the exit status of the restart command.
# The cooldown state and the restart record are written even when the restart
# command fails, so a broken restart is not retried on every cron tick.
restart_gateway() {
    local reason="$1" detail="$2" r1="$3" r2="$4" r3="$5"
    local rc=0 restart_ts cooldown_ts
    # RESTART_CMD holds a whole command line in one string; splitting it into
    # words here is intentional.
    # shellcheck disable=SC2086
    $RESTART_CMD 2>&1 || rc=$?
    restart_ts=$(date '+%Y-%m-%dT%H:%M:%S%z')
    cooldown_ts=$(($(date +%s) + COOLDOWN))
    state_write "$restart_ts" "$reason" "$cooldown_ts" \
        || log "WARN: failed to write state file $STATE_FILE"
    log_restart "$reason" "$detail" "$r1" "$r2" "$r3" \
        || log "WARN: failed to append to $RESTART_LOG"
    return "$rc"
}

log "watchdog v2 check start (log: $LOG_FILE)"

# 1. Health check comes first and does not depend on the log file: a Gateway
#    that is down may not have written a log for today at all. The cooldown is
#    honoured here as well, otherwise a Gateway that needs more than a minute
#    to become healthy would be restarted on every cron tick.
# shellcheck disable=SC2086
if ! $HEALTH_CMD &>/dev/null; then
    if is_in_cooldown; then
        log "WARN: Gateway health check failed but in cooldown until $(state_get "cooldown_until"), skipping restart"
    else
        log "WARN: Gateway health check failed, attempting restart"
        restart_gateway "health_fail" "Gateway health check failed" 0 0 0 \
            || log "ERROR: Gateway restart command failed (reason: health_fail)"
        log "Gateway restart attempted (reason: health_fail)"
    fi
    exit 0
fi

# 2. Log-based detection needs the current log file.
if [ ! -f "$LOG_FILE" ]; then
    log "INFO: Gateway log file not found ($LOG_FILE), skipping detection"
    exit 0
fi

# 3. Count the hits of each rule (sets R1_COUNT, R2_COUNT, R3_COUNT).
count_rules

log "rule counts: R1(FailoverError)=${R1_COUNT}/${R1_THRESHOLD} R2(stalled)=${R2_COUNT}/${R2_THRESHOLD} R3(rate_limit)=${R3_COUNT}/${R3_THRESHOLD}"

# 4. Pick the first rule at or above its threshold (priority R1 > R2 > R3).
triggered_reason=""
triggered_detail=""

if [ "$R1_COUNT" -ge "$R1_THRESHOLD" ]; then
    triggered_reason="R1"
    triggered_detail="FailoverError x${R1_COUNT}"
elif [ "$R2_COUNT" -ge "$R2_THRESHOLD" ]; then
    triggered_reason="R2"
    triggered_detail="stalled recovery=none x${R2_COUNT}"
elif [ "$R3_COUNT" -ge "$R3_THRESHOLD" ]; then
    triggered_reason="R3"
    triggered_detail="rate_limit/429 x${R3_COUNT}"
fi

# 5. Act on the result.
if [ -z "$triggered_reason" ]; then
    # No rule fired.
    log "all clear"
elif is_in_cooldown; then
    local_cooldown_until=$(state_get "cooldown_until")
    log "ALERT: ${triggered_reason} triggered (${triggered_detail}) but in cooldown until ${local_cooldown_until}, skipping restart"
else
    log "ALERT: ${triggered_reason} triggered (${triggered_detail}), restarting Gateway"
    # A failing restart is reported instead of aborting the script under
    # set -e before the cooldown has been recorded.
    if restart_gateway "$triggered_reason" "$triggered_detail" "$R1_COUNT" "$R2_COUNT" "$R3_COUNT"; then
        log "Gateway restart completed (reason: ${triggered_reason})"
    else
        log "ERROR: Gateway restart command failed (reason: ${triggered_reason}); cooldown recorded"
    fi
fi