diff --git a/src/daemon/spawner.py b/src/daemon/spawner.py index 99e0b4c..60c71ae 100644 --- a/src/daemon/spawner.py +++ b/src/daemon/spawner.py @@ -768,6 +768,10 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ agent_id, session_id, outcome, exit_code, task_status) if cls["should_retry"]: + # cooldown: 新增的可恢复场景(A14/A15/A16/A8/A10) + cooldown_seconds = cls.get("cooldown_seconds", 0) + if cooldown_seconds and self.counter: + self.counter.set_cooldown(agent_id, seconds=cooldown_seconds) # A2/A3: gateway_timeout → 续杯(on_complete 会 release counter) await self._do_retry( session_id, agent_id, task_id, on_complete, db_path, @@ -828,7 +832,7 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ agent_id, session_id, task_id, fallback_count, self.max_retries, json_result.get("fallback_reason")) if self.counter: - self.counter.set_cooldown(agent_id, seconds=30) + self.counter.set_cooldown(agent_id, seconds=60) await self._do_retry( session_id, agent_id, task_id, on_complete, db_path, "fallback_retry_count" # 独立计数,不与 gateway_timeout 的 retry_count 共用 @@ -839,7 +843,7 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ # A10(compact_failed), A12(agent_error) # v2.8.1 Fix-3a: crash 类 outcome 设 cooldown,给 agent session 恢复时间 if outcome in ("crashed", "compact_failed", "process_crash", "session_stuck", - "compact_hanging", "agent_error") and self.counter: + "compact_hanging", "agent_error", "compact_interrupted") and self.counter: self.counter.set_cooldown(agent_id, seconds=300) # 5 分钟 logger.info("Crash/error cooldown set for %s: 300s (outcome=%s)", agent_id, outcome) # 注意: cooldown 期间任务状态仍为 working,但 counter 已释放。 @@ -1286,8 +1290,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ task_status: Optional[str], stdout_text: str = "") -> dict: """分类退出原因,返回处理策略 - v3.0: 基于 JSON status/summary/executionTrace 判定,不再依赖 transport 字段。 - 只有 status="timeout" 触发 retry,其他都不 retry。 + v3.1: A0 拆分为 A14-A17(信号中断/stderr 智能分类)。 + A8/A10 改为可恢复 retry。cooldown 统一 60s。 """ status = json_result.get("status") summary = json_result.get("summary", "") @@ -1312,10 +1316,22 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ return {"outcome": "gateway_timeout", "should_retry": True, "retry_field": "retry_count"} - # A0: stdout 为空且 exit≠0 = 进程异常终止 - # 注意:exit=0 + stdout 为空可能是正常完成(--json 没输出), - # 此时 task_status 如果是 done/review 会被上面的 A4 兜住 + # A0 拆分: 无 JSON 输出 + exit≠0 if status is None and not stdout_text.strip() and exit_code != 0: + # A14: SIGINT(130) / SIGTERM(143) → 外部中断,可恢复 + if exit_code in (130, 143): + return {"outcome": "interrupted", "should_retry": True, + "retry_field": "retry_count", "cooldown_seconds": 60} + # A15/A16: stderr 含 network/compact 关键字 → 可恢复 + if stderr_text: + stderr_lower = stderr_text.lower() + if any(kw in stderr_lower for kw in ["econnrefused", "etimedout", "gateway closed", "econnreset"]): + return {"outcome": "gateway_unreachable", "should_retry": True, + "retry_field": "retry_count", "cooldown_seconds": 60} + if any(kw in stderr_lower for kw in ["compaction-diag", "context-overflow"]): + return {"outcome": "compact_interrupted", "should_retry": True, + "retry_field": "retry_count", "cooldown_seconds": 60} + # A17: 真正的 crash → 保持 working,ticker 兜底 return {"outcome": "crashed", "should_retry": False, "original": "process_crash"} # stdout 为空但 exit=0:可能是正常完成但 --json 没输出 @@ -1332,13 +1348,15 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ if any(kw in stderr_lower for kw in ["401", "403", "unauthorized", "auth"]): return {"outcome": "auth_failed", "should_retry": False} if any(kw in stderr_lower for kw in ["econnrefused", "etimedout", "gateway closed", "econnreset"]): - return {"outcome": "gateway_unreachable", "should_retry": False} + return {"outcome": "gateway_unreachable", "should_retry": True, + "retry_field": "retry_count", "cooldown_seconds": 60} if any(kw in stderr_lower for kw in ["rate_limit", "500", "503", "api error"]): return {"outcome": "api_error", "should_retry": False} if any(kw in stderr_lower for kw in ["compaction-diag", "context-overflow"]): return {"outcome": "compact_failed", "should_retry": False} if any(kw in stderr_lower for kw in ["lock", "busy", "concurrent", "lane task error"]): - return {"outcome": "lock_conflict", "should_retry": False} + return {"outcome": "lock_conflict", "should_retry": True, + "retry_field": "retry_count", "cooldown_seconds": 60} return {"outcome": "agent_error", "should_retry": False} # 兜底:status 未知值