From 336504fe7986f517ca484b88ad2ef52622789899 Mon Sep 17 00:00:00 2001 From: cfdaily Date: Tue, 26 May 2026 08:21:55 +0800 Subject: [PATCH] auto-sync: 2026-05-26 08:21:55 --- src/daemon/spawner.py | 48 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/src/daemon/spawner.py b/src/daemon/spawner.py index 0d7e9e6..29306e0 100644 --- a/src/daemon/spawner.py +++ b/src/daemon/spawner.py @@ -544,7 +544,12 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ async def _handle_exit(self, session_id, agent_id, task_id, exit_code, stdout_chunks, stderr_chunks, on_complete, db_path): - """情况 A:进程退出后的处理""" + """情况 A:进程退出后的处理 + + v2.7.2: 进程退出 = counter release(由 on_complete = wrapped_on_complete 保证)。 + 只有 A2/A3(gateway_timeout)触发续杯,其他都不 retry。 + A9(api_error/429)额外推回 pending + 设冷却。 + """ stdout_text = b"".join(stdout_chunks).decode("utf-8", errors="replace") stderr_text = b"".join(stderr_chunks).decode("utf-8", errors="replace") @@ -582,16 +587,45 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ logger.info("Agent %s finished (session=%s, outcome=%s, exit=%d, task_status=%s)", agent_id, session_id, outcome, exit_code, task_status) - if cls["release_counter"]: - await self._do_on_complete_async(on_complete, agent_id, outcome) - elif cls["should_retry"]: - # 续杯期间 counter 保持占用(设计文档规定) - # on_complete 传入续杯链:最终完成或超限时由 _do_retry 调用 release + if cls["should_retry"]: + # A2/A3: gateway_timeout → 续杯(on_complete 会 release counter) await self._do_retry( session_id, agent_id, task_id, on_complete, db_path, cls.get("retry_field", "retry_count") ) - # else: 暂时性失败(A8/A9/A11),不 release,不 retry,等 ticker + elif outcome == "api_error": + # A9: 429/API 错误 → release counter(on_complete)+ 推回 pending + 冷却 + await self._do_on_complete_async(on_complete, agent_id, outcome) + if self.counter: + self.counter.set_cooldown(agent_id) + if db_path and task_id: + self._mark_task(db_path, task_id, "pending", { + "reason": "api_error_retry", + }) + logger.info("Task %s pushed back to pending (api_error)", task_id) + elif outcome == "fallback_timeout" and not cls["should_retry"]: + # A5/A6: fallback 不应出现,标 failed + escalate + context 日志 + logger.error("UNEXPECTED FALLBACK: agent=%s session=%s task=%s " + "transport=%s fallbackReason=%s counter_active=%s " + "This indicates counter check failed to prevent concurrent spawn.", + agent_id, session_id, task_id, + meta.get("transport"), meta.get("fallbackReason"), + self.counter.active_agents if self.counter else "N/A") + await self._do_on_complete_async(on_complete, agent_id, outcome) + if db_path and task_id: + self._mark_task(db_path, task_id, "failed", { + "reason": "unexpected_fallback", + "transport": meta.get("transport"), + "fallback_reason": meta.get("fallbackReason"), + "duration_ms": meta.get("durationMs"), + }) + else: + # 其他:A1(completed), A4(agent_failed), A7(auth_failed), + # A8(gateway_unreachable), A11(lock_conflict), + # A10(compact_failed), A12(agent_error) + # 进程退出 → on_complete release counter + # 任务状态由各 outcome 自行处理(或等 ticker) + await self._do_on_complete_async(on_complete, agent_id, outcome) async def _handle_monitor_timeout(self, session_id, agent_id, task_id, proc, on_complete, db_path, stderr_chunks,