"""Daemon Ticker — 30s 状态扫描 + 依赖推进 + 事件驱动 Tick 循环是整个 Daemon 的核心驱动: 1. 遍历所有 active 项目 2. 每个 tick 扫描黑板状态 3. 推进依赖链(blocked → pending) 4. 写入 daemon_tick 事件 5. 调度 pending 任务(F9 实现) """ from __future__ import annotations import asyncio import json import logging from datetime import datetime from pathlib import Path from typing import Any, Callable, Coroutine, Dict, List, Optional from src.blackboard.operations import Blackboard from src.blackboard.db import get_connection from src.blackboard.models import Task from src.blackboard.queries import Queries from src.blackboard.registry import ProjectRegistry logger = logging.getLogger("moziplus-v2.ticker") class Ticker: """Daemon ticker 主循环""" def __init__( self, registry: ProjectRegistry, tick_interval: float = 30.0, max_ticks: Optional[int] = None, on_tick_complete: Optional[Callable[[], Coroutine[Any, Any, None]]] = None, dispatcher: Optional[Any] = None, spawner: Optional[Any] = None, max_dispatch_per_tick: int = 3, claim_timeout_minutes: float = 5.0, default_task_timeout_minutes: float = 30.0, ): """ Args: registry: 项目注册表 tick_interval: tick 间隔秒数 max_ticks: 测试用,限制最大 tick 次数 on_tick_complete: 每次 tick 完成后的回调(用于通知等) dispatcher: Dispatcher 实例(Agent 调度) spawner: AgentSpawner 实例(Agent spawn) max_dispatch_per_tick: 每个 tick 最多调度多少个 pending 任务 claim_timeout_minutes: claimed 超时重置为 pending 的分钟数 default_task_timeout_minutes: working 超时标记 failed 的分钟数 """ self.registry = registry self.tick_interval = tick_interval self.max_ticks = max_ticks self.on_tick_complete = on_tick_complete self.dispatcher = dispatcher self.spawner = spawner self.max_dispatch_per_tick = max_dispatch_per_tick self.claim_timeout_minutes = claim_timeout_minutes self.default_task_timeout_minutes = default_task_timeout_minutes self._tick_count: int = 0 self._running: bool = False self._task: Optional[asyncio.Task] = None # 已初始化的 db_path 集合,避免每次 tick 重复 init_db self._initialized_dbs: set = set() # 每个项目上次 tick 的 event count(用于僵尸检测 F8) self._last_event_counts: Dict[str, int] = {} # 当前项目 ID(_dispatch_pending 需要) self._current_project_id: Optional[str] = None # ------------------------------------------------------------------ # 生命周期 # ------------------------------------------------------------------ async def start(self) -> None: """启动 tick 循环""" if self._running: return self._running = True self._task = asyncio.create_task(self._loop()) logger.info("Ticker started (interval=%.1fs)", self.tick_interval) async def stop(self) -> None: """停止 tick 循环""" self._running = False if self._task: self._task.cancel() try: await self._task except asyncio.CancelledError: pass logger.info("Ticker stopped (ticks=%d)", self._tick_count) @property def tick_count(self) -> int: return self._tick_count @property def is_running(self) -> bool: return self._running # ------------------------------------------------------------------ # 主循环 # ------------------------------------------------------------------ async def _loop(self) -> None: """Tick 主循环""" while self._running: try: await self.tick() except asyncio.CancelledError: raise except Exception: logger.exception("Tick %d error", self._tick_count + 1) if self.max_ticks and self._tick_count >= self.max_ticks: logger.info("Max ticks reached (%d), stopping", self.max_ticks) self._running = False break try: await asyncio.sleep(self.tick_interval) except asyncio.CancelledError: return # ------------------------------------------------------------------ # 单次 tick # ------------------------------------------------------------------ async def tick(self) -> Dict[str, Any]: """执行一次 tick,遍历所有 active 项目""" self._tick_count += 1 tick_num = self._tick_count results: Dict[str, Any] = { "tick": tick_num, "projects": {}, } projects = self.registry.list_projects() active_projects = { pid: info for pid, info in projects.items() if info.get("status") == "active" } for project_id, project_info in active_projects.items(): try: pr = await self._tick_project(project_id, project_info) results["projects"][project_id] = pr except Exception as e: logger.exception("Tick %d project %s error", tick_num, project_id) results["projects"][project_id] = {"error": str(e)} logger.debug("Tick %d complete: %d projects", tick_num, len(active_projects)) if self.on_tick_complete: try: await self.on_tick_complete() except Exception: logger.exception("on_tick_complete callback error") return results async def _tick_project(self, project_id: str, project_info: Dict[str, Any]) -> Dict[str, Any]: """单项目的 tick 处理""" project_dir = self.registry.root / project_id db_path = project_dir / "blackboard.db" if not db_path.exists(): return {"status": "no_db"} # 只在首次遇到该 db_path 时执行 init_db db_key = str(db_path) if db_key not in self._initialized_dbs: from src.blackboard.db import init_db init_db(db_path) self._initialized_dbs.add(db_key) result: Dict[str, Any] = { "status": "ok", "summary_before": {}, "summary_after": {}, "advanced": [], "dispatched": [], "review_dispatched": [], "zombie_reclaimed": [], "parents_refreshed": [], } # 保存当前 project_id(供 _dispatch_pending 使用) self._current_project_id = project_id # 1. 扫描当前状态 queries = Queries(db_path) result["summary_before"] = queries.task_summary() # 2. 依赖推进 advanced = self._advance_dependencies(db_path) result["advanced"] = advanced # 3. 僵尸/超时处理(在依赖推进后、调度前) zombie_reclaimed = self._check_timeouts(db_path) result["zombie_reclaimed"] = zombie_reclaimed # 4. 调度 pending 任务 if self.dispatcher and self.spawner: dispatched = await self._dispatch_pending(db_path, project_id) result["dispatched"] = dispatched # 5. 调度审查任务 if self.dispatcher and self.spawner: review_dispatched = await self._dispatch_reviews(db_path, project_id) result["review_dispatched"] = review_dispatched # 6. 聚合父 Task 状态(v2.7) parents_refreshed = self._refresh_parent_statuses(db_path) result["parents_refreshed"] = parents_refreshed # 7. 写 daemon_tick 事件 conn = get_connection(db_path) try: conn.execute("BEGIN IMMEDIATE") conn.execute( "INSERT INTO events (task_id, agent, event_type, detail) VALUES (?,?,?,?)", (None, "daemon", "daemon_tick", json.dumps({"tick": self._tick_count, "advanced_count": len(advanced), "parents_refreshed": len(parents_refreshed)})), ) conn.commit() finally: conn.close() # 8. 扫描后状态 result["summary_after"] = queries.task_summary() return result # ------------------------------------------------------------------ # 父 Task 状态聚合 # ------------------------------------------------------------------ def _refresh_parent_statuses(self, db_path: Path) -> List[str]: """全量扫描有子 Task 的父 Task,刷新聚合状态 跳过手动状态(cancelled, paused)的父 Task。使用单连接批量处理。 """ queries = Queries(db_path) refreshed: List[str] = [] conn = get_connection(db_path) try: # 找出所有有子 Task 的父 Task ID parent_rows = conn.execute( "SELECT DISTINCT parent_task FROM tasks WHERE parent_task IS NOT NULL" ).fetchall() parent_ids = [r["parent_task"] for r in parent_rows] for pid in parent_ids: computed = queries.compute_parent_status(pid) if computed is None: continue parent = conn.execute( "SELECT status FROM tasks WHERE id=?", (pid,) ).fetchone() if parent and parent["status"] != computed: conn.execute( "UPDATE tasks SET status=?, updated_at=datetime('now') WHERE id=?", (computed, pid), ) refreshed.append(pid) logger.info("Parent %s status aggregated: → %s", pid, computed) if refreshed: conn.commit() finally: conn.close() return refreshed return refreshed # ------------------------------------------------------------------ # 依赖推进 # ------------------------------------------------------------------ def _advance_dependencies(self, db_path: Path) -> List[str]: """检查 blocked 任务,若所有依赖已完成则推进为 pending Returns: 被推进的 task_id 列表 """ queries = Queries(db_path) blocked = queries.blocked_tasks_with_deps() advanced: List[str] = [] if not blocked: return advanced conn = get_connection(db_path) try: for item in blocked: if item["all_deps_done"]: task_id = item["task_id"] ok = self._transition_status(conn, task_id, "pending", agent="daemon", detail={"reason": "all_dependencies_done"}) if ok: advanced.append(task_id) logger.info("Advanced %s: blocked → pending", task_id) finally: conn.close() return advanced def _transition_status(self, conn, task_id: str, new_status: str, agent: str = "daemon", detail: Optional[Dict] = None) -> bool: """轻量状态转换(不走 Blackboard 类,避免 init_db)""" from src.blackboard.db import VALID_TRANSITIONS, TERMINAL_STATUSES, EVENT_TYPES from datetime import datetime conn.execute("BEGIN IMMEDIATE") row = conn.execute("SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone() if not row: return False old_status = row["status"] if old_status in TERMINAL_STATUSES or old_status == new_status: return old_status == new_status if new_status not in VALID_TRANSITIONS.get(old_status, set()): return False now = datetime.utcnow().isoformat() conn.execute( "UPDATE tasks SET status=?, updated_at=? WHERE id=?", (new_status, now, task_id), ) event_type = f"task_{new_status}" if event_type not in EVENT_TYPES: event_type = "daemon_tick" conn.execute( "INSERT INTO events (task_id, agent, event_type, detail) VALUES (?,?,?,?)", (task_id, agent, event_type, json.dumps({"from": old_status, "to": new_status, **(detail or {})})), ) conn.commit() return True # ------------------------------------------------------------------ # Agent 调度 # ------------------------------------------------------------------ async def _dispatch_pending(self, db_path: Path, project_id: str) -> List[str]: """扫描 pending 任务并调度""" queries = Queries(db_path) pending = queries.pending_dispatchable() dispatched: List[str] = [] if not pending: return dispatched for task in pending[:self.max_dispatch_per_tick]: try: result = await self.dispatcher.dispatch( task, project_config={"project_id": project_id, "db_path": db_path}, ) if result["status"] == "dispatched" and result["level"] in ("full", "escalate"): # 标记为 claimed + 更新 current_agent conn = get_connection(db_path) try: ok = self._transition_status( conn, task.id, "claimed", agent="daemon", detail={"dispatched_to": result["agent_id"], "session_id": result.get("session_id")}, ) if ok: # 更新 current_agent(Router 审查时用) conn.execute( "UPDATE tasks SET current_agent=? WHERE id=?", (result["agent_id"], task.id), ) conn.commit() dispatched.append(task.id) logger.info("Dispatched %s to %s (session=%s)", task.id, result["agent_id"], result.get("session_id")) finally: conn.close() except Exception: logger.exception("Dispatch failed for %s", task.id) return dispatched async def _dispatch_reviews(self, db_path: Path, project_id: str) -> List[str]: """扫描 review 状态任务,检查是否有产出,调度审查 Agent""" queries = Queries(db_path) bb = Blackboard(db_path) review_tasks = queries.tasks_by_status("review") dispatched: List[str] = [] for task in review_tasks: # 检查是否已有 review 记录 reviews = bb.get_reviews(task.id) if reviews: continue # 已有审查,跳过 # 检查是否最近已 dispatch 过 review(防重复 dispatch) existing = self._check_recent_routing(db_path, task.id, "review") if existing: continue # 已有活跃 review dispatch # 检查是否有产出(司马懿建议:无产出直接标 failed) outputs = bb.get_outputs(task.id) if not outputs: conn = get_connection(db_path) try: self._transition_status( conn, task.id, "failed", agent="daemon", detail={"reason": "no_output_for_review"}, ) bb.add_observation( task.id, "daemon", "任务进入 review 状态但没有产出物,自动标记为 failed", ) logger.warning("Task %s in review but no output, marking failed", task.id) finally: conn.close() continue # 调度审查 Agent(司马懿) try: result = await self.dispatcher.dispatch( task, action_type="review", project_config={"project_id": project_id, "db_path": db_path}, ) if result["status"] == "dispatched": dispatched.append(task.id) # 更新 current_agent 为审查者 conn = get_connection(db_path) try: conn.execute( "UPDATE tasks SET current_agent=? WHERE id=?", (result["agent_id"], task.id), ) conn.commit() finally: conn.close() logger.info("Dispatched review for %s to %s", task.id, result["agent_id"]) except Exception: logger.exception("Review dispatch failed for %s", task.id) return dispatched # ------------------------------------------------------------------ # 僵尸/超时处理 # ------------------------------------------------------------------ def _check_timeouts(self, db_path: Path) -> List[str]: """检查 claimed/working 超时的任务""" queries = Queries(db_path) reclaimed: List[str] = [] now = datetime.utcnow() # UTC,与 SQLite datetime('now') 一致 # claimed 超时 → 重置为 pending claimed = queries.tasks_by_status("claimed") for task in claimed: if not task.claimed_at: continue try: claimed_time = datetime.fromisoformat(task.claimed_at) elapsed = (now - claimed_time).total_seconds() / 60.0 if elapsed > self.claim_timeout_minutes: conn = get_connection(db_path) try: ok = self._transition_status( conn, task.id, "pending", agent="daemon", detail={"reason": "claim_timeout", "elapsed_minutes": round(elapsed, 1)}, ) if ok: reclaimed.append(task.id) logger.info("Reclaimed %s: claimed → pending (timeout %.1fm)", task.id, elapsed) finally: conn.close() except (ValueError, TypeError): pass # working 超时 → 标记为 failed working = queries.tasks_by_status("working") for task in working: start_time_str = task.started_at or task.claimed_at if not start_time_str: continue try: start_time = datetime.fromisoformat(start_time_str) # per-task timeout: deadline 优先,否则用默认值 if task.deadline: deadline_time = datetime.fromisoformat(task.deadline) timeout_minutes = (deadline_time - start_time).total_seconds() / 60.0 if timeout_minutes < 1: timeout_minutes = self.default_task_timeout_minutes else: timeout_minutes = self.default_task_timeout_minutes elapsed = (now - start_time).total_seconds() / 60.0 if elapsed > timeout_minutes: conn = get_connection(db_path) try: ok = self._transition_status( conn, task.id, "failed", agent="daemon", detail={"reason": "task_timeout", "elapsed_minutes": round(elapsed, 1), "timeout_minutes": round(timeout_minutes, 1)}, ) if ok: reclaimed.append(task.id) logger.warning("Task %s timed out (working %.1fm > %.1fm)", task.id, elapsed, timeout_minutes) finally: conn.close() except (ValueError, TypeError): pass return reclaimed def _check_recent_routing(self, db_path: Path, task_id: str, action_type: str) -> bool: """检查最近 5 分钟内是否已 dispatch 过指定类型的路由(防重复)""" try: conn = get_connection(db_path) try: # 检查是否有 from_status=review 的 dispatched 记录(防止重复 review dispatch) if action_type == "review": row = conn.execute( "SELECT COUNT(*) as cnt FROM routing_decisions " "WHERE task_id=? AND outcome='dispatched' " "AND from_status='review' " "AND created_at > datetime('now', '-5 minutes')", (task_id,), ).fetchone() else: row = conn.execute( "SELECT COUNT(*) as cnt FROM routing_decisions " "WHERE task_id=? AND outcome='dispatched' " "AND created_at > datetime('now', '-5 minutes')", (task_id,), ).fetchone() return row["cnt"] > 0 if row else False finally: conn.close() except Exception: return False # ------------------------------------------------------------------ # 手动 tick(API 端点触发) # ------------------------------------------------------------------ async def manual_tick(self) -> Dict[str, Any]: """手动触发一次 tick""" logger.info("Manual tick triggered") result = await self.tick() result["manual"] = True return result