diff --git a/src/daemon/spawner.py b/src/daemon/spawner.py index 9163920..122fd66 100644 --- a/src/daemon/spawner.py +++ b/src/daemon/spawner.py @@ -558,12 +558,33 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ async def _do_retry(self, session_id, agent_id, task_id, on_complete, db_path, retry_field="retry_count"): """续杯:用同一 session_id 再 spawn 一次""" - retry_counts = self._get_retry_counts(db_path, task_id) - count = retry_counts.get(retry_field, 0) + 1 - - # 更新计数器并写回最新 attempt 的 metadata - retry_counts[retry_field] = count - self._update_retry_counts(db_path, task_id, retry_counts) + # 直接读写 tasks 表的 retry_count(广播场景下所有 Agent 共享同一 tasks 记录) + # task_attempts metadata 的 retry_count 不可靠(多 Agent 互相覆盖) + if retry_field == "retry_count" and db_path and task_id: + try: + conn = get_connection(db_path) + try: + conn.execute("BEGIN IMMEDIATE") + conn.execute( + "UPDATE tasks SET retry_count = COALESCE(retry_count, 0) + 1 WHERE id=?", + (task_id,), + ) + conn.commit() + row = conn.execute( + "SELECT retry_count FROM tasks WHERE id=?", (task_id,) + ).fetchone() + count = row["retry_count"] if row else 1 + finally: + conn.close() + except Exception: + logger.exception("Failed to update retry_count for task %s", task_id) + count = 1 + else: + # 非 retry_count 的计数器(connect/api/lock)仍用 task_attempts metadata + retry_counts = self._get_retry_counts(db_path, task_id) + count = retry_counts.get(retry_field, 0) + 1 + retry_counts[retry_field] = count + self._update_retry_counts(db_path, task_id, retry_counts) if count >= self.max_retries: logger.error("Agent %s max retries (session=%s, %s=%d)",