[moz] feat: Runaway Guard per-task dispatch 上限

§15 Runaway Guard — per-task dispatch_count 上限，防止无限循环 dispatch

问题：mail/toolchain task 走 handler auto-working（跳过 claim），不受 claim_timeout 3 次重试兜底保护。如果反复 spawn 但永远到不了 done/failed，会无限循环消耗资源（实际案例：2026-06-15 mention 重复投递事件）。

设计：
- tasks 表新增 dispatch_count 字段
- 每次 ticker 成功 dispatch 时递增
- dispatch_count >= 10 时自动标 failed（reason=runaway_guard）
- 覆盖所有非终态（pending/working/claimed）
- 参考 Hermes v0.13 §3 Per-Task 重试上限

改动文件：
- src/blackboard/db.py: _safe_add_column dispatch_count
- src/blackboard/models.py: Task dataclass 加 dispatch_count
- src/daemon/ticker.py: dispatch 递增 + _check_timeouts runaway guard
- docs/design/15-runaway-guard.md: 设计文档
- tests/integration/test_ticker_integration.py: E13 测试 3 个

测试：456 passed, 3 skipped
2026-06-16 00:18:15 +08:00
parent cc5c7f5ad1
commit 9ec601d747
5 changed files with 206 additions and 0 deletions
@@ -117,6 +117,7 @@ def _migrate_v28(conn: sqlite3.Connection) -> None:

    _safe_add_column(conn, "tasks", "round_count", "INTEGER DEFAULT 0")
    _safe_add_column(conn, "tasks", "resumed_from", "TEXT")
+    _safe_add_column(conn, "tasks", "dispatch_count", "INTEGER DEFAULT 0")

    # 3. checkpoints 表（M3）
    conn.execute("""CREATE TABLE IF NOT EXISTS checkpoints (
@@ -41,6 +41,8 @@ class Task:
    resumed_from: Optional[str] = None       # 暂停前状态，恢复时回到原状态
    # v2.9 四相循环
    round_count: int = 0                     # 庞统 review 轮次计数
+    # §15 Runaway Guard
+    dispatch_count: int = 0                  # 被 ticker dispatch 的总次数
    # v2.8 归档
    archived: bool = False
    archived_at: Optional[str] = None
@@ -1084,6 +1084,19 @@ Parent Task ID: {parent_task.id}
            broadcast_ids = await self._broadcast_claim(broadcast_tasks, db_path, project_id)
            dispatched.extend(broadcast_ids)

+        # §15 Runaway Guard: 统一递增 dispatch_count
+        if dispatched:
+            conn = get_connection(db_path)
+            try:
+                for tid in dispatched:
+                    conn.execute(
+                        "UPDATE tasks SET dispatch_count = COALESCE(dispatch_count, 0) + 1 WHERE id=?",
+                        (tid,),
+                    )
+                conn.commit()
+            finally:
+                conn.close()
+
        return dispatched

    async def _broadcast_claim(self, tasks: list, db_path: Path,
@@ -1376,6 +1389,19 @@ Parent Task ID: {parent_task.id}
            except Exception:
                logger.exception("Review dispatch failed for %s", task.id)

+        # §15 Runaway Guard: 统一递增 dispatch_count (review)
+        if dispatched:
+            conn = get_connection(db_path)
+            try:
+                for tid in dispatched:
+                    conn.execute(
+                        "UPDATE tasks SET dispatch_count = COALESCE(dispatch_count, 0) + 1 WHERE id=?",
+                        (tid,),
+                    )
+                conn.commit()
+            finally:
+                conn.close()
+
        return dispatched

    # ------------------------------------------------------------------
@@ -1388,6 +1414,31 @@ Parent Task ID: {parent_task.id}
        reclaimed: List[str] = []
        now = datetime.utcnow()  # UTC，与 SQLite datetime('now') 一致

+        # §15 Runaway Guard: per-task dispatch_count 上限检查
+        # 覆盖所有非终态（pending/working/claimed），防止无限循环 dispatch
+        MAX_DISPATCH_COUNT = 10
+        for status_to_check in ("pending", "working", "claimed"):
+            tasks_to_check = queries.tasks_by_status(status_to_check)
+            for task in tasks_to_check:
+                dispatch_count = getattr(task, 'dispatch_count', 0) or 0
+                if dispatch_count >= MAX_DISPATCH_COUNT:
+                    conn = get_connection(db_path)
+                    try:
+                        ok = self._transition_status(
+                            conn, task.id, "failed",
+                            agent="daemon",
+                            detail={"reason": "runaway_guard",
+                                    "dispatch_count": dispatch_count,
+                                    "message": f"dispatch {dispatch_count} 次仍未完成，自动标 failed"},
+                        )
+                        if ok:
+                            reclaimed.append(task.id)
+                            logger.error(
+                                "Task %s: runaway guard triggered (dispatch_count=%d, status=%s), marking failed",
+                                task.id, dispatch_count, status_to_check)
+                    finally:
+                        conn.close()
+
        # claimed 超时 → 重置为 pending（如果 retry_count >= 3 则升级庞统）
        claimed = queries.tasks_by_status("claimed")
        for task in claimed: