diff --git a/src/daemon/spawner.py b/src/daemon/spawner.py index ef108e9..df972bf 100644 --- a/src/daemon/spawner.py +++ b/src/daemon/spawner.py @@ -659,13 +659,38 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ # 检查 session 状态 state = self._check_session_state(agent_id) - # B1: 假死 + # B1: 假死 — 先复活,连续假死 ≥2 次再 failed if state.get("status") == "running" and not state.get("lock_pid_alive", True): - logger.error("Agent %s session stuck (session=%s, lock PID dead)", - agent_id, session_id) - self._mark_task(db_path, task_id, "failed", - {"reason": "session_stuck", "diagnostics": state}) - await self._do_on_complete_async(on_complete, agent_id, "session_stuck") + # 假死计数 + stuck_count = self._stuck_counts.get(task_id, 0) + 1 + self._stuck_counts[task_id] = stuck_count + + if stuck_count >= 2: + # 连续假死 ≥2 次,标 failed + logger.error("Agent %s session stuck %d times (session=%s, lock PID dead)", + agent_id, stuck_count, session_id) + self._mark_task(db_path, task_id, "failed", + {"reason": "session_stuck", "stuck_count": stuck_count, + "diagnostics": state}) + await self._do_on_complete_async(on_complete, agent_id, "session_stuck") + return + + # 第 1 次假死 → 尝试复活 + logger.warning("Agent %s session stuck (attempt %d), reviving (session=%s)", + agent_id, stuck_count, session_id) + revived = self._revive_session(agent_id) + if revived: + logger.info("Agent %s session revived, releasing counter for ticker re-dispatch", + agent_id) + # release counter → 任务保持 working → ticker 下次 re-dispatch + await self._do_on_complete_async(on_complete, agent_id, "session_revived") + else: + # 复活失败 → 标 failed + logger.error("Agent %s revive failed, marking failed", agent_id) + self._mark_task(db_path, task_id, "failed", + {"reason": "revive_failed", "stuck_count": stuck_count, + "diagnostics": state}) + await self._do_on_complete_async(on_complete, agent_id, "revive_failed") return # B2/B3/B4: 进程还活着