auto-sync: 2026-05-26 11:47:58
This commit is contained in:
+31
-6
@@ -659,13 +659,38 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
|
||||
# 检查 session 状态
|
||||
state = self._check_session_state(agent_id)
|
||||
|
||||
# B1: 假死
|
||||
# B1: 假死 — 先复活,连续假死 ≥2 次再 failed
|
||||
if state.get("status") == "running" and not state.get("lock_pid_alive", True):
|
||||
logger.error("Agent %s session stuck (session=%s, lock PID dead)",
|
||||
agent_id, session_id)
|
||||
self._mark_task(db_path, task_id, "failed",
|
||||
{"reason": "session_stuck", "diagnostics": state})
|
||||
await self._do_on_complete_async(on_complete, agent_id, "session_stuck")
|
||||
# 假死计数
|
||||
stuck_count = self._stuck_counts.get(task_id, 0) + 1
|
||||
self._stuck_counts[task_id] = stuck_count
|
||||
|
||||
if stuck_count >= 2:
|
||||
# 连续假死 ≥2 次,标 failed
|
||||
logger.error("Agent %s session stuck %d times (session=%s, lock PID dead)",
|
||||
agent_id, stuck_count, session_id)
|
||||
self._mark_task(db_path, task_id, "failed",
|
||||
{"reason": "session_stuck", "stuck_count": stuck_count,
|
||||
"diagnostics": state})
|
||||
await self._do_on_complete_async(on_complete, agent_id, "session_stuck")
|
||||
return
|
||||
|
||||
# 第 1 次假死 → 尝试复活
|
||||
logger.warning("Agent %s session stuck (attempt %d), reviving (session=%s)",
|
||||
agent_id, stuck_count, session_id)
|
||||
revived = self._revive_session(agent_id)
|
||||
if revived:
|
||||
logger.info("Agent %s session revived, releasing counter for ticker re-dispatch",
|
||||
agent_id)
|
||||
# release counter → 任务保持 working → ticker 下次 re-dispatch
|
||||
await self._do_on_complete_async(on_complete, agent_id, "session_revived")
|
||||
else:
|
||||
# 复活失败 → 标 failed
|
||||
logger.error("Agent %s revive failed, marking failed", agent_id)
|
||||
self._mark_task(db_path, task_id, "failed",
|
||||
{"reason": "revive_failed", "stuck_count": stuck_count,
|
||||
"diagnostics": state})
|
||||
await self._do_on_complete_async(on_complete, agent_id, "revive_failed")
|
||||
return
|
||||
|
||||
# B2/B3/B4: 进程还活着
|
||||
|
||||
Reference in New Issue
Block a user