From f83de496e3200d51a1fb167b7b4cf56a6b74ff23 Mon Sep 17 00:00:00 2001
From: cfdaily <cfdaily@gitee.com>
Date: Tue, 2 Jun 2026 19:10:31 +0800
Subject: [PATCH] auto-sync: 2026-06-02 19:10:31

---
 src/daemon/spawner.py | 69 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/daemon/spawner.py b/src/daemon/spawner.py
index b867c05..5e8ec45 100644
--- a/src/daemon/spawner.py
+++ b/src/daemon/spawner.py
@@ -775,29 +775,64 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
             )
         elif outcome == "api_error":
             # A9: 429/API 错误 → release counter(on_complete)+ 推回 pending + 冷却
+            # 有上限：api_retry_count 累计达 max_retries 则标 failed
             await self._do_on_complete_async(on_complete, agent_id, outcome)
             if self.counter:
                 self.counter.set_cooldown(agent_id)
             if db_path and task_id:
-                self._mark_task(db_path, task_id, "pending", {
-                    "reason": "api_error_retry",
-                })
-                logger.info("Task %s pushed back to pending (api_error)", task_id)
+                retry_counts = self._get_retry_counts(db_path, task_id)
+                api_count = retry_counts.get("api_retry_count", 0) + 1
+                retry_counts["api_retry_count"] = api_count
+                self._update_retry_counts(db_path, task_id, retry_counts)
+                if api_count >= self.max_retries:
+                    logger.error("Task %s api_retry_count=%d >= max_retries, marking failed",
+                                 task_id, api_count)
+                    self._mark_task(db_path, task_id, "failed", {
+                        "reason": "max_api_retry_count", "count": api_count,
+                    })
+                else:
+                    self._mark_task(db_path, task_id, "pending", {
+                        "reason": "api_error_retry",
+                        "api_retry_count": api_count,
+                    })
+                    logger.info("Task %s pushed back to pending (api_error, api_retry=%d/%d)",
+                                task_id, api_count, self.max_retries)
         elif outcome == "fallback_timeout" and not cls["should_retry"]:
-            # A5/A6: fallback 不应出现,标 failed + escalate + context 日志
-            logger.error("UNEXPECTED FALLBACK: agent=%s session=%s task=%s "
-                         "fallback_used=%s fallback_reason=%s counter_active=%s "
-                         "This indicates counter check failed to prevent concurrent spawn.",
-                         agent_id, session_id, task_id,
-                         json_result.get("fallback_used"), json_result.get("fallback_reason"),
-                         self.counter.active_agents if self.counter else "N/A")
-            await self._do_on_complete_async(on_complete, agent_id, outcome)
+            # A3/A3b: fallback 分级处理
+            # fallback_count 从 task_attempts.metadata 读取，
+            # 达 max_retries 标 failed(A3)，否则 retry + cooldown(A3b)
+            fallback_count = 0
             if db_path and task_id:
-                self._mark_task(db_path, task_id, "failed", {
-                    "reason": "unexpected_fallback",
-                    "status": json_result.get("status"),
-                    "fallback_reason": json_result.get("fallback_reason"),
-                })
+                retry_counts = self._get_retry_counts(db_path, task_id)
+                fallback_count = retry_counts.get("fallback_count", 0) + 1
+                retry_counts["fallback_count"] = fallback_count
+                self._update_retry_counts(db_path, task_id, retry_counts)
+
+            if fallback_count >= self.max_retries:
+                # A3: 连续 fallback 达上限，标 failed
+                logger.error("A3 fallback exhausted: agent=%s session=%s task=%s "
+                             "fallback_count=%d reason=%s",
+                             agent_id, session_id, task_id, fallback_count,
+                             json_result.get("fallback_reason"))
+                await self._do_on_complete_async(on_complete, agent_id, outcome)
+                if db_path and task_id:
+                    self._mark_task(db_path, task_id, "failed", {
+                        "reason": "fallback_exhausted",
+                        "fallback_count": fallback_count,
+                        "fallback_reason": json_result.get("fallback_reason"),
+                    })
+            else:
+                # A3b: fallback 未达上限，retry + cooldown
+                logger.warning("A3b fallback retry: agent=%s session=%s task=%s "
+                               "fallback_count=%d/%d reason=%s",
+                               agent_id, session_id, task_id, fallback_count,
+                               self.max_retries, json_result.get("fallback_reason"))
+                if self.counter:
+                    self.counter.set_cooldown(agent_id, seconds=30)
+                await self._do_retry(
+                    session_id, agent_id, task_id, on_complete, db_path,
+                    "retry_count"
+                )
         else:
             # 其他:A1(completed), A4(agent_failed), A7(auth_failed),
             #      A8(gateway_unreachable), A11(lock_conflict),