From f83de496e3200d51a1fb167b7b4cf56a6b74ff23 Mon Sep 17 00:00:00 2001 From: cfdaily Date: Tue, 2 Jun 2026 19:10:31 +0800 Subject: [PATCH] auto-sync: 2026-06-02 19:10:31 --- src/daemon/spawner.py | 69 ++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/src/daemon/spawner.py b/src/daemon/spawner.py index b867c05..5e8ec45 100644 --- a/src/daemon/spawner.py +++ b/src/daemon/spawner.py @@ -775,29 +775,64 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ ) elif outcome == "api_error": # A9: 429/API 错误 → release counter(on_complete)+ 推回 pending + 冷却 + # 有上限:api_retry_count 累计达 max_retries 则标 failed await self._do_on_complete_async(on_complete, agent_id, outcome) if self.counter: self.counter.set_cooldown(agent_id) if db_path and task_id: - self._mark_task(db_path, task_id, "pending", { - "reason": "api_error_retry", - }) - logger.info("Task %s pushed back to pending (api_error)", task_id) + retry_counts = self._get_retry_counts(db_path, task_id) + api_count = retry_counts.get("api_retry_count", 0) + 1 + retry_counts["api_retry_count"] = api_count + self._update_retry_counts(db_path, task_id, retry_counts) + if api_count >= self.max_retries: + logger.error("Task %s api_retry_count=%d >= max_retries, marking failed", + task_id, api_count) + self._mark_task(db_path, task_id, "failed", { + "reason": "max_api_retry_count", "count": api_count, + }) + else: + self._mark_task(db_path, task_id, "pending", { + "reason": "api_error_retry", + "api_retry_count": api_count, + }) + logger.info("Task %s pushed back to pending (api_error, api_retry=%d/%d)", + task_id, api_count, self.max_retries) elif outcome == "fallback_timeout" and not cls["should_retry"]: - # A5/A6: fallback 不应出现,标 failed + escalate + context 日志 - logger.error("UNEXPECTED FALLBACK: agent=%s session=%s task=%s " - "fallback_used=%s fallback_reason=%s counter_active=%s " - "This indicates counter check failed to prevent concurrent spawn.", - agent_id, session_id, task_id, - json_result.get("fallback_used"), json_result.get("fallback_reason"), - self.counter.active_agents if self.counter else "N/A") - await self._do_on_complete_async(on_complete, agent_id, outcome) + # A3/A3b: fallback 分级处理 + # fallback_count 从 task_attempts.metadata 读取, + # 达 max_retries 标 failed(A3),否则 retry + cooldown(A3b) + fallback_count = 0 if db_path and task_id: - self._mark_task(db_path, task_id, "failed", { - "reason": "unexpected_fallback", - "status": json_result.get("status"), - "fallback_reason": json_result.get("fallback_reason"), - }) + retry_counts = self._get_retry_counts(db_path, task_id) + fallback_count = retry_counts.get("fallback_count", 0) + 1 + retry_counts["fallback_count"] = fallback_count + self._update_retry_counts(db_path, task_id, retry_counts) + + if fallback_count >= self.max_retries: + # A3: 连续 fallback 达上限,标 failed + logger.error("A3 fallback exhausted: agent=%s session=%s task=%s " + "fallback_count=%d reason=%s", + agent_id, session_id, task_id, fallback_count, + json_result.get("fallback_reason")) + await self._do_on_complete_async(on_complete, agent_id, outcome) + if db_path and task_id: + self._mark_task(db_path, task_id, "failed", { + "reason": "fallback_exhausted", + "fallback_count": fallback_count, + "fallback_reason": json_result.get("fallback_reason"), + }) + else: + # A3b: fallback 未达上限,retry + cooldown + logger.warning("A3b fallback retry: agent=%s session=%s task=%s " + "fallback_count=%d/%d reason=%s", + agent_id, session_id, task_id, fallback_count, + self.max_retries, json_result.get("fallback_reason")) + if self.counter: + self.counter.set_cooldown(agent_id, seconds=30) + await self._do_retry( + session_id, agent_id, task_id, on_complete, db_path, + "retry_count" + ) else: # 其他:A1(completed), A4(agent_failed), A7(auth_failed), # A8(gateway_unreachable), A11(lock_conflict),