fix(lint): 修复 PR #14 引入的 lint 回退 (119→0)
CI / lint (pull_request) Successful in 6s
CI / test (pull_request) Successful in 9s
CI / notify-on-failure (pull_request) Successful in 0s

PR #14 从旧分支复制文件导致回退了 PR #10 的 lint 修复。
修复内容:
- autoflake 移除未使用导入/变量
- autopep8 修复缩进/空格
- 手动修复 F821(pathlib→Path), F541(f-string), F841(未使用变量)
- 所有修复均通过 flake8 --max-line-length=120 --extend-ignore=E501 检查 (0 errors)
This commit is contained in:
cfdaily
2026-06-09 23:53:29 +08:00
parent 7184079a75
commit d58e38d58f
27 changed files with 863 additions and 417 deletions
+11 -9
View File
@@ -11,8 +11,7 @@ A 类 Skill 由引擎确定性注入全文,不靠 Description 触发。
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, List
logger = logging.getLogger("moziplus-v2.bootstrap")
@@ -28,12 +27,12 @@ class BootstrapBuilder:
"""L2 引擎注入层构建器(v2.1 四段式)"""
ROLE_SKILL_MAP = {
"executor": "blackboard-executor",
"reviewer": "blackboard-reviewer",
"reviewer-simayi": "blackboard-reviewer-simayi",
"executor": "blackboard-executor",
"reviewer": "blackboard-reviewer",
"reviewer-simayi": "blackboard-reviewer-simayi",
"reviewer-pangtong": "blackboard-reviewer-pangtong",
"planner": "blackboard-planner",
"claim": "blackboard-claim",
"planner": "blackboard-planner",
"claim": "blackboard-claim",
}
# 默认从环境变量或配置读取,fallback 到默认路径
@@ -62,7 +61,9 @@ class BootstrapBuilder:
# 段 2: 前序产出(有依赖时注入)
if task.get("depends_on_outputs"):
sections.append(self._format_prior_outputs(task["depends_on_outputs"]))
sections.append(
self._format_prior_outputs(
task["depends_on_outputs"]))
# 段 3: 角色操作规范全文(通过 ROLE_SKILL_MAP 从 Skill 文件读取)
skill_name = self.ROLE_SKILL_MAP.get(role)
@@ -134,7 +135,8 @@ class BootstrapBuilder:
"""格式化前序产出摘要(段 2"""
parts = ["## 前序产出"]
for out in outputs:
parts.append(f"- [{out.get('task_id', '?')}] {out.get('summary', '无摘要')}")
parts.append(
f"- [{out.get('task_id', '?')}] {out.get('summary', '无摘要')}")
return "\n".join(parts)
def _format_constraints(self, role: str) -> str:
+9 -5
View File
@@ -68,20 +68,23 @@ class ActiveAgentCounter:
self._cooldown_until.pop(agent_id, None)
return False
def set_cooldown(self, agent_id: str, seconds: Optional[float] = None) -> None:
def set_cooldown(self, agent_id: str,
seconds: Optional[float] = None) -> None:
"""设置冷却期(默认 120 秒)"""
cd = seconds if seconds is not None else self._default_cooldown_seconds
self._cooldown_until[agent_id] = time.time() + cd
logger.info("Cooldown set for %s: %.0fs (until %.0f)",
agent_id, cd, self._cooldown_until[agent_id])
agent_id, cd, self._cooldown_until[agent_id])
async def can_acquire(self, agent_id: str, session_id: str = "main") -> bool:
async def can_acquire(self, agent_id: str,
session_id: str = "main") -> bool:
"""三层检查:cooldown → global → per agent → per session key"""
if self.is_cooling_down(agent_id):
return False
if self._global_active >= self._max_global:
return False
if self._agent_active.get(agent_id, 0) >= self._max_concurrent_sessions:
if self._agent_active.get(
agent_id, 0) >= self._max_concurrent_sessions:
return False
key = self._make_key(agent_id, session_id)
if self._active_keys.get(key, 0) >= self._max_per_session:
@@ -122,7 +125,8 @@ class ActiveAgentCounter:
del self._active_keys[key]
if agent_id in self._agent_active:
self._agent_active[agent_id] = max(0, self._agent_active[agent_id] - 1)
self._agent_active[agent_id] = max(
0, self._agent_active[agent_id] - 1)
if self._agent_active[agent_id] == 0:
del self._agent_active[agent_id]
+162 -71
View File
@@ -14,7 +14,6 @@ from __future__ import annotations
import json
import logging
import sqlite3
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
@@ -22,7 +21,7 @@ from typing import Any, Dict, List, Optional
from src.blackboard.models import Task
from src.blackboard.db import get_connection
from src.daemon.spawner import AgentBusyError
from src.daemon.router import AgentRouter, RouteDecision
from src.daemon.router import AgentRouter
logger = logging.getLogger("moziplus-v2.dispatcher")
@@ -64,7 +63,8 @@ class Dispatcher:
if self._legacy_mode:
self.registered_agents = set(registered_agents or [])
self.capability_map = capability_map or {}
logger.warning("Dispatcher running in legacy mode (no AgentRouter)")
logger.warning(
"Dispatcher running in legacy mode (no AgentRouter)")
def decide(self, task: Task, action_type: str = "") -> Dict[str, Any]:
"""调度决策(委托给 Router
@@ -124,16 +124,21 @@ class Dispatcher:
"""
# 安全红线检查(调度前拦截)
# Mail 是 Agent 间通信,不做 guardrail 检查
is_mail = project_config.get("project_id") == "_mail" if project_config else False
is_mail = project_config.get(
"project_id") == "_mail" if project_config else False
if self.guardrails and not is_mail:
violations = self.guardrails.check_task(task)
critical = [v for v in violations if v.action in ("block_and_notify", "terminate_and_escalate")]
critical = [
v for v in violations if v.action in (
"block_and_notify",
"terminate_and_escalate")]
if critical:
v = critical[0]
logger.warning("Task '%s' blocked by guardrail: %s - %s",
task.title, v.rule_id, v.message)
# 写入黑板事件
_routing_db = Path(project_config["db_path"]) if project_config and "db_path" in project_config else self.db_path
_routing_db = Path(
project_config["db_path"]) if project_config and "db_path" in project_config else self.db_path
if _routing_db:
self._record_routing(task, {"level": DispatchLevel.BLOCKED, "agent_id": "none",
"reason": v.message}, "blocked", v.message, _routing_db)
@@ -152,7 +157,8 @@ class Dispatcher:
decision = self.decide(task, action_type)
level = decision["level"]
# 从 project_config 获取项目级 DB 路径(路由审计日志写入项目 DB)
_routing_db = Path(project_config["db_path"]) if project_config and "db_path" in project_config else None
_routing_db = Path(
project_config["db_path"]) if project_config and "db_path" in project_config else None
agent_id = decision["agent_id"]
# v2.7.2: counter 检查移到 spawn_full_agent 内部
@@ -160,7 +166,8 @@ class Dispatcher:
# 本地执行
if level == DispatchLevel.LOCAL:
self._record_routing(task, decision, "dispatched", None, _routing_db)
self._record_routing(
task, decision, "dispatched", None, _routing_db)
return {
"level": level.value,
"agent_id": "daemon",
@@ -172,7 +179,8 @@ class Dispatcher:
# Full Agent / Escalate spawn
if level in (DispatchLevel.FULL_AGENT, DispatchLevel.ESCALATE):
if not self.spawner:
self._record_routing(task, decision, "error", "No spawner", _routing_db)
self._record_routing(
task, decision, "error", "No spawner", _routing_db)
return {
"level": level.value,
"agent_id": agent_id,
@@ -183,9 +191,11 @@ class Dispatcher:
try:
# [v2.7.1] Mail: 标 working 移到 spawn_full_agent 内部(check 通过后、subprocess 前)
is_mail = project_config.get("project_id") == "_mail" if project_config else False
is_mail = project_config.get(
"project_id") == "_mail" if project_config else False
if is_mail:
db_path = Path(project_config["db_path"]) if project_config and "db_path" in project_config else None
db_path = Path(
project_config["db_path"]) if project_config and "db_path" in project_config else None
# on_checks_passed: 所有检查通过后才标 working,检查失败不标
on_checks_passed = None
@@ -194,6 +204,7 @@ class Dispatcher:
_task_id = task.id
_mail_db = db_path
_disp = self
def _mail_on_checks_passed():
nonlocal _mail_marked_working
if not _disp._mail_auto_working(_task_id, _mail_db):
@@ -203,8 +214,9 @@ class Dispatcher:
# 构建 spawn message
message = self._build_spawn_message(task, agent_id, project_config,
mode=decision.get("mode", ""),
spawn_type=action_type or "executor")
mode=decision.get(
"mode", ""),
spawn_type=action_type or "executor")
# v2.7.2: on_complete 只含业务逻辑,不含 counter.release
# counter.release 由 spawn_full_agent 内部的 wrapped_on_complete 保证
@@ -218,14 +230,17 @@ class Dispatcher:
def _mail_on_complete(aid, outcome):
# 幻觉门控:检查是否有回复,自动标 done/failed
try:
_dispatcher._mail_auto_complete(_task_id, aid, _mail_db, _must_haves, outcome=outcome)
_dispatcher._mail_auto_complete(
_task_id, aid, _mail_db, _must_haves, outcome=outcome)
except Exception as e:
logger.error("Mail %s: on_complete error: %s", _task_id, e)
logger.error(
"Mail %s: on_complete error: %s", _task_id, e)
on_complete = _mail_on_complete
else:
# #02: Task 路径也加 on_complete(幻觉门控)
_task_id = task.id
_task_db = Path(project_config["db_path"]) if project_config and "db_path" in project_config else None
_task_db = Path(
project_config["db_path"]) if project_config and "db_path" in project_config else None
_dispatcher = self
_is_review = action_type == "review"
@@ -239,10 +254,12 @@ class Dispatcher:
try:
# #07.2: 统一 crash 回退——executor 和 review 都回退 current_agent
if outcome in ROLLBACK_CURRENT_AGENT_OUTCOMES and _task_db:
_dispatcher._rollback_current_agent(_task_db, _task_id, aid)
_dispatcher._rollback_current_agent(
_task_db, _task_id, aid)
if _is_review:
if _task_db and outcome in ("completed", "session_revived"):
if _task_db and outcome in (
"completed", "session_revived"):
# #09: 读 verdict 决定后续动作
conn = get_connection(_task_db)
try:
@@ -254,14 +271,18 @@ class Dispatcher:
conn.close()
if review and review["verdict"] == "approved":
_dispatcher._mark_task_status(_task_db, _task_id, "done")
logger.info("Task %s: review approved, marking done", _task_id)
_dispatcher._mark_task_status(
_task_db, _task_id, "done")
logger.info(
"Task %s: review approved, marking done", _task_id)
else:
# 非 approved → @mention 被审 agentassignee,非 current_agent
# 非 approved → @mention 被审
# agentassignee,非 current_agent
verdict_str = review["verdict"] if review else "未知"
conn2 = get_connection(_task_db)
try:
task_row = conn2.execute("SELECT assignee FROM tasks WHERE id=?", (_task_id,)).fetchone()
task_row = conn2.execute(
"SELECT assignee FROM tasks WHERE id=?", (_task_id,)).fetchone()
finally:
conn2.close()
@@ -269,18 +290,21 @@ class Dispatcher:
from src.blackboard.blackboard import Blackboard
bb = Blackboard(_task_db)
bb.add_comment(_task_id, "daemon",
f"@{task_row['assignee']} 审查结论: {verdict_str},请查看详情并决定接受或反驳",
comment_type="review")
f"@{task_row['assignee']} 审查结论: {verdict_str},请查看详情并决定接受或反驳",
comment_type="review")
logger.info("Task %s: review verdict=%s, notified assignee=%s",
_task_id, verdict_str, task_row["assignee"] if task_row else "?")
# 不标 done,保持 review 状态
else:
logger.warning("Task %s: review agent %s (%s), NOT marking done", _task_id, aid, outcome)
logger.warning(
"Task %s: review agent %s (%s), NOT marking done", _task_id, aid, outcome)
else:
# executor: 三信号验证 → 标 review
_dispatcher._task_auto_complete(_task_id, _task_db)
_dispatcher._task_auto_complete(
_task_id, _task_db)
except Exception as e:
logger.error("Task %s: on_complete error: %s", _task_id, e)
logger.error(
"Task %s: on_complete error: %s", _task_id, e)
on_complete = _task_on_complete
session_id = await self.spawner.spawn_full_agent(
@@ -289,7 +313,8 @@ class Dispatcher:
task_id=task.id,
on_complete=on_complete,
use_main_session=True, # #02: 统一投递到 main session
task_db_path=Path(project_config["db_path"]) if project_config and "db_path" in project_config else None,
task_db_path=Path(
project_config["db_path"]) if project_config and "db_path" in project_config else None,
on_checks_passed=on_checks_passed,
)
@@ -312,9 +337,14 @@ class Dispatcher:
else:
log_level = logger.debug
detail_msg = f"Agent busy: {reason}"
log_level("Dispatch skipped %s for task %s: %s", agent_id, task.id, detail_msg)
log_level(
"Dispatch skipped %s for task %s: %s",
agent_id,
task.id,
detail_msg)
# on_checks_passed 未执行(check 失败在它之前),working 未标,无需回退
self._record_routing(task, decision, "skipped", detail_msg, _routing_db)
self._record_routing(
task, decision, "skipped", detail_msg, _routing_db)
return {
"level": level.value,
"agent_id": agent_id,
@@ -326,7 +356,8 @@ class Dispatcher:
# on_checks_passed 已执行但 subprocess 失败 → 回退 working → pending
if _mail_marked_working:
self._mail_revert_to_pending(task.id, db_path)
self._record_routing(task, decision, "error", str(e), _routing_db)
self._record_routing(
task, decision, "error", str(e), _routing_db)
return {
"level": level.value,
"agent_id": agent_id,
@@ -385,9 +416,16 @@ class Dispatcher:
def _build_delegate_prompt(self, task: Task,
project_config: Optional[Dict]) -> str:
"""构建 delegate 模式的 prompt(协调员分配任务)"""
api_host = getattr(self.spawner, 'api_host', '127.0.0.1') if self.spawner else '127.0.0.1'
api_port = getattr(self.spawner, 'api_port', 8083) if self.spawner else 8083
project_id = project_config.get("project_id", "") if project_config else ""
api_host = getattr(
self.spawner,
'api_host',
'127.0.0.1') if self.spawner else '127.0.0.1'
api_port = getattr(
self.spawner,
'api_port',
8083) if self.spawner else 8083
project_id = project_config.get(
"project_id", "") if project_config else ""
return f"""你是任务协调员。请分析以下任务,决定最合适的执行者并分配。
@@ -478,7 +516,8 @@ class Dispatcher:
# ── Legacy 兼容(deprecated ──
def _legacy_decide(self, task: Task, action_type: str = "") -> Dict[str, Any]:
def _legacy_decide(
self, task: Task, action_type: str = "") -> Dict[str, Any]:
"""旧版三级决策树(兼容过渡用)"""
LOCAL_ACTIONS = frozenset({
"L1_guardrail", "format_check",
@@ -518,7 +557,8 @@ class Dispatcher:
return registered[0]
return "pangtong-fujunshi"
async def _legacy_dispatch(self, task, action_type="", project_config=None):
async def _legacy_dispatch(
self, task, action_type="", project_config=None):
"""旧版 dispatch(兼容过渡用)
v2.7.2: counter acquire/release 移到 spawn_full_agent 内部。
@@ -541,15 +581,19 @@ class Dispatcher:
# NOTE: _legacy_dispatch 仅在 router=None 时触发,当前配置不会进入。
# Mail 永远走 dispatch() 主路径(on_checks_passed 方案),不走此路径。
# 如果未来 legacy 路径被启用,需同步 on_checks_passed 逻辑。
is_mail_legacy = project_config.get("project_id") == "_mail" if project_config else False
is_mail_legacy = project_config.get(
"project_id") == "_mail" if project_config else False
if is_mail_legacy:
db_path_legacy = Path(project_config["db_path"]) if project_config and "db_path" in project_config else None
if not db_path_legacy or not self._mail_auto_working(task.id, db_path_legacy):
db_path_legacy = Path(
project_config["db_path"]) if project_config and "db_path" in project_config else None
if not db_path_legacy or not self._mail_auto_working(
task.id, db_path_legacy):
return {"level": level.value, "agent_id": agent_id,
"session_id": None, "status": "error",
"reason": "mail_auto_working_failed"}
if hasattr(self.spawner, 'build_spawn_message') and project_config:
if hasattr(self.spawner,
'build_spawn_message') and project_config:
retry_ctx = self._build_retry_context(task)
message = self.spawner.build_spawn_message(
task_id=task.id, title=task.title,
@@ -576,9 +620,11 @@ class Dispatcher:
def _mail_oc_legacy(aid, outcome):
try:
_disp._mail_auto_complete(_t_id, aid, _m_db, _m_mh, outcome=outcome)
_disp._mail_auto_complete(
_t_id, aid, _m_db, _m_mh, outcome=outcome)
except Exception as e:
logger.error("Mail %s: legacy on_complete error: %s", _t_id, e)
logger.error(
"Mail %s: legacy on_complete error: %s", _t_id, e)
on_complete_legacy = _mail_oc_legacy
session_id = await self.spawner.spawn_full_agent(
@@ -586,14 +632,16 @@ class Dispatcher:
task_id=task.id,
on_complete=on_complete_legacy,
use_main_session=True, # #02: 统一投递到 main session
task_db_path=Path(project_config["db_path"]) if project_config and "db_path" in project_config else None,
task_db_path=Path(
project_config["db_path"]) if project_config and "db_path" in project_config else None,
)
return {"level": level.value, "agent_id": agent_id,
"session_id": session_id, "status": "dispatched",
"reason": decision["reason"]}
except AgentBusyError as e:
reason = getattr(e, 'reason', 'busy')
detail_msg = f"Session busy: {reason}" if reason.startswith("session_") else f"Agent busy: {reason}"
detail_msg = f"Session busy: {reason}" if reason.startswith(
"session_") else f"Agent busy: {reason}"
return {"level": level.value, "agent_id": agent_id,
"session_id": None, "status": "skipped",
"reason": detail_msg}
@@ -618,9 +666,11 @@ class Dispatcher:
conn = get_connection(db_path)
try:
conn.execute("BEGIN IMMEDIATE")
row = conn.execute("SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
row = conn.execute(
"SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
if not row:
logger.warning("Mail %s: cannot mark working (task not found)", task_id)
logger.warning(
"Mail %s: cannot mark working (task not found)", task_id)
return False
if row["status"] not in ("pending", "claimed"):
logger.warning("Mail %s: cannot mark working (status=%s, expected pending/claimed)",
@@ -631,7 +681,10 @@ class Dispatcher:
(task_id,),
)
conn.commit()
logger.info("Mail %s: auto-marked working (system, was %s)", task_id, row["status"])
logger.info(
"Mail %s: auto-marked working (system, was %s)",
task_id,
row["status"])
return True
finally:
conn.close()
@@ -645,30 +698,40 @@ class Dispatcher:
conn = get_connection(db_path)
try:
conn.execute("BEGIN IMMEDIATE")
row = conn.execute("SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
row = conn.execute(
"SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
if row and row["status"] == "working":
conn.execute(
"UPDATE tasks SET status='pending', updated_at=datetime('now') WHERE id=?",
(task_id,),
)
conn.commit()
logger.info("Mail %s: reverted working → pending (spawn failed)", task_id)
logger.info(
"Mail %s: reverted working → pending (spawn failed)", task_id)
else:
logger.debug("Mail %s: skip revert (status=%s, expected working)", task_id, row["status"] if row else "not_found")
logger.debug(
"Mail %s: skip revert (status=%s, expected working)",
task_id,
row["status"] if row else "not_found")
finally:
conn.close()
except Exception as e:
logger.error("Mail %s: failed to revert to pending: %s", task_id, e)
logger.error(
"Mail %s: failed to revert to pending: %s",
task_id,
e)
def _mail_auto_complete(self, task_id: str, agent_id: str,
db_path: Path, must_haves: str, outcome=None) -> None:
db_path: Path, must_haves: str, outcome=None) -> None:
"""Mail 任务:on_complete 后自动标 done/failed(含幻觉门控)"""
try:
# 解析 performative
performative = "request"
try:
meta = json.loads(must_haves) if must_haves else {}
performative = meta.get("performative", meta.get("type", "request"))
performative = meta.get(
"performative", meta.get(
"type", "request"))
except Exception:
pass
@@ -677,13 +740,15 @@ class Dispatcher:
has_reply = self._mail_check_reply(task_id, db_path)
if not has_reply:
# F3: 立刻标 failed(不等 ticker 30 分钟)
logger.error("Mail %s: no reply found, marking failed (no_reply_found)", task_id)
logger.error(
"Mail %s: no reply found, marking failed (no_reply_found)", task_id)
for attempt in range(3):
try:
conn = get_connection(db_path)
try:
conn.execute("BEGIN IMMEDIATE")
row = conn.execute("SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
row = conn.execute(
"SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
if not row:
return
if row["status"] == "working":
@@ -697,19 +762,24 @@ class Dispatcher:
json.dumps({"reason": "no_reply_found"}, ensure_ascii=False)),
)
conn.commit()
logger.info("Mail %s: marked failed (no_reply_found)", task_id)
logger.info(
"Mail %s: marked failed (no_reply_found)", task_id)
# Mail 失败通知:通知发件人
try:
from src.daemon.mail_notify import notify_mail_failed
notify_mail_failed(db_path, task_id, "no_reply_found")
notify_mail_failed(
db_path, task_id, "no_reply_found")
except Exception as ne:
logger.warning("Mail %s: failed to send no_reply_found notification: %s", task_id, ne)
logger.warning(
"Mail %s: failed to send no_reply_found notification: %s", task_id, ne)
return
finally:
conn.close()
except Exception as e:
logger.warning("Mail %s: failed attempt %d: %s", task_id, attempt + 1, e)
logger.error("Mail %s: all 3 failed attempts failed, leaving for ticker", task_id)
logger.warning(
"Mail %s: failed attempt %d: %s", task_id, attempt + 1, e)
logger.error(
"Mail %s: all 3 failed attempts failed, leaving for ticker", task_id)
return
# inform 类型:只对成功 outcome 标 done,失败 outcome 留 working 等 ticker 重投
@@ -717,7 +787,10 @@ class Dispatcher:
if performative == "inform":
INFORM_DONE_OUTCOMES = {"completed", "claimed", "no_reply"}
if outcome not in INFORM_DONE_OUTCOMES:
logger.info("Mail %s: inform outcome=%s, skip auto-done", task_id, outcome)
logger.info(
"Mail %s: inform outcome=%s, skip auto-done",
task_id,
outcome)
return
# 标 done(重试 3 次)
@@ -726,7 +799,8 @@ class Dispatcher:
conn = get_connection(db_path)
try:
conn.execute("BEGIN IMMEDIATE")
row = conn.execute("SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
row = conn.execute(
"SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
if not row:
return
if row["status"] == "working":
@@ -741,9 +815,15 @@ class Dispatcher:
finally:
conn.close()
except Exception as e:
logger.warning("Mail %s: done attempt %d failed: %s", task_id, attempt + 1, e)
logger.warning(
"Mail %s: done attempt %d failed: %s",
task_id,
attempt + 1,
e)
# 3 次都失败,留 working 等 ticker 超时兜底
logger.error("Mail %s: all 3 done attempts failed, leaving for ticker", task_id)
logger.error(
"Mail %s: all 3 done attempts failed, leaving for ticker",
task_id)
except Exception as e:
logger.error("Mail %s: auto-complete error: %s", task_id, e)
@@ -788,7 +868,9 @@ class Dispatcher:
logger.info("Task %s: verify passed, marking review", task_id)
self._mark_task_status(db_path, task_id, "review")
else:
logger.info("Task %s: verify not passed (no signal), leaving working", task_id)
logger.info(
"Task %s: verify not passed (no signal), leaving working",
task_id)
except Exception as e:
logger.error("Task %s: auto-complete error: %s", task_id, e)
@@ -823,7 +905,8 @@ class Dispatcher:
logger.error("Task %s: verify error: %s", task_id, e)
return True
def _rollback_current_agent(self, db_path: Path, task_id: str, agent_id: str) -> None:
def _rollback_current_agent(
self, db_path: Path, task_id: str, agent_id: str) -> None:
"""#07.2: crash 后回退 current_agent 到 assignee,避免 exclude_current 卡死"""
try:
conn = get_connection(db_path)
@@ -837,11 +920,18 @@ class Dispatcher:
conn.commit()
finally:
conn.close()
logger.info("Task %s: rolled back current_agent from %s to assignee", task_id, agent_id)
logger.info(
"Task %s: rolled back current_agent from %s to assignee",
task_id,
agent_id)
except Exception as e:
logger.warning("Task %s: failed to rollback current_agent: %s", task_id, e)
logger.warning(
"Task %s: failed to rollback current_agent: %s",
task_id,
e)
def _mark_task_status(self, db_path: Path, task_id: str, status: str) -> None:
def _mark_task_status(self, db_path: Path,
task_id: str, status: str) -> None:
"""更新任务状态 + 写审计事件"""
try:
conn = get_connection(db_path)
@@ -857,7 +947,8 @@ class Dispatcher:
)
conn.execute(
"INSERT INTO events (task_id, agent, event_type, payload) VALUES (?, 'dispatcher', 'status_change', ?)",
(task_id, f'{{"from": "{old_status}", "to": "{status}", "source": "auto_complete"}}'),
(task_id,
f'{{"from": "{old_status}", "to": "{status}", "source": "auto_complete"}}'),
)
conn.commit()
finally:
@@ -866,7 +957,7 @@ class Dispatcher:
logger.error("Task %s: mark status error: %s", task_id, e)
@staticmethod
def _check_crash_limit(task_id: str, db_path: pathlib.Path, limit: int = 3,
def _check_crash_limit(task_id: str, db_path: Path, limit: int = 3,
window_minutes: int = 30) -> bool:
"""v2.8.1 Fix-3c: 检查 task 最近 window_minutes 内的 crash 次数是否超限。
+3 -3
View File
@@ -14,7 +14,7 @@ import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional
logger = logging.getLogger("moziplus-v2.experience")
@@ -68,7 +68,7 @@ class Experience:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> Experience:
return cls(**{k: v for k, v in data.items() if k != "id"},
experience_id=data.get("id"))
experience_id=data.get("id"))
class ExperienceStore:
@@ -284,7 +284,7 @@ class ExperienceDistiller:
all_tags.append(task_type)
results = self.store.search(tags=all_tags if all_tags else None,
query=query, limit=limit)
query=query, limit=limit)
# 按置信度排序
results.sort(key=lambda e: e.confidence, reverse=True)
+16 -6
View File
@@ -4,7 +4,7 @@ from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
@@ -38,7 +38,9 @@ class GuardrailEngine:
data = yaml.safe_load(f)
self.rules = data.get("rules", [])
self.settings = data.get("settings", {"enabled": True})
logger.info("Loaded %d guardrail rules from %s", len(self.rules), config_path)
logger.info(
"Loaded %d guardrail rules from %s", len(
self.rules), config_path)
def check_task(self, task: Any) -> List[GuardrailViolation]:
"""检查 Task 是否触犯安全红线(调度前调用)"""
@@ -95,7 +97,8 @@ class GuardrailEngine:
return violations
def check_token_usage(self, token_count: int) -> Optional[GuardrailViolation]:
def check_token_usage(
self, token_count: int) -> Optional[GuardrailViolation]:
"""检查 Token 消耗是否超标"""
if not self.settings.get("enabled", True):
return None
@@ -103,7 +106,10 @@ class GuardrailEngine:
for rule in self.rules:
if rule["id"] != "high_token_usage":
continue
threshold = rule.get("triggers", [{}])[0].get("token_threshold", 100000)
threshold = rule.get(
"triggers", [
{}])[0].get(
"token_threshold", 100000)
if token_count > threshold:
return GuardrailViolation(
rule_id=rule["id"],
@@ -114,7 +120,8 @@ class GuardrailEngine:
)
return None
def check_consecutive_failure(self, failure_count: int) -> Optional[GuardrailViolation]:
def check_consecutive_failure(
self, failure_count: int) -> Optional[GuardrailViolation]:
"""检查连续失败次数"""
if not self.settings.get("enabled", True):
return None
@@ -122,7 +129,10 @@ class GuardrailEngine:
for rule in self.rules:
if rule["id"] != "consecutive_failure":
continue
threshold = rule.get("triggers", [{}])[0].get("consecutive_failures", 3)
threshold = rule.get(
"triggers", [
{}])[0].get(
"consecutive_failures", 3)
if failure_count >= threshold:
return GuardrailViolation(
rule_id=rule["id"],
+11 -6
View File
@@ -9,9 +9,9 @@ from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional
from typing import Any, Dict
from src.blackboard.db import get_connection, init_db
from src.blackboard.db import get_connection
from src.blackboard.queries import Queries
logger = logging.getLogger("moziplus-v2.health")
@@ -41,7 +41,7 @@ class HealthChecker:
{"healthy": bool, "zombie": bool, "stale_ticks": int,
"alert_written": bool, "resolved": bool}
"""
db_key = str(db_path)
str(db_path)
result: Dict[str, Any] = {
"healthy": True,
"zombie": False,
@@ -58,7 +58,8 @@ class HealthChecker:
# 用 event count 变化判断是否有真实变更
conn = queries._conn()
try:
total_events = conn.execute("SELECT COUNT(*) FROM events").fetchone()[0]
conn.execute(
"SELECT COUNT(*) FROM events").fetchone()[0]
non_tick_events = conn.execute(
"SELECT COUNT(*) FROM events WHERE event_type != 'daemon_tick' "
"AND event_type != 'agent_zombie_detected'"
@@ -85,7 +86,8 @@ class HealthChecker:
self._stale_ticks[project_id] = stale
result["stale_ticks"] = stale
if stale >= self.zombie_threshold and not self._alerted.get(project_id):
if stale >= self.zombie_threshold and not self._alerted.get(
project_id):
# 写告警
self._write_alert(db_path, project_id, tick_num, stale)
self._alerted[project_id] = True
@@ -126,7 +128,10 @@ class HealthChecker:
conn.commit()
finally:
conn.close()
logger.warning("Zombie detected: %s (stale=%d)", project_id, stale_ticks)
logger.warning(
"Zombie detected: %s (stale=%d)",
project_id,
stale_ticks)
def _write_resolution(self, db_path: Path, project_id: str,
tick_num: int) -> None:
+6 -5
View File
@@ -15,7 +15,6 @@ from __future__ import annotations
import asyncio
import json
import logging
import os
from pathlib import Path
from typing import Any, Callable, Coroutine, Dict, List, Optional
@@ -28,7 +27,8 @@ class InboxWatcher:
def __init__(
self,
inbox_path: Path,
process_callback: Optional[Callable[[Dict[str, Any]], Coroutine[Any, Any, None]]] = None,
process_callback: Optional[Callable[[
Dict[str, Any]], Coroutine[Any, Any, None]]] = None,
watch_interval: float = 1.0,
):
"""
@@ -57,7 +57,7 @@ class InboxWatcher:
self._running = True
self._task = asyncio.create_task(self._loop())
logger.info("Inbox watcher started (path=%s, interval=%.1fs)",
self.inbox_path, self.watch_interval)
self.inbox_path, self.watch_interval)
async def stop(self) -> None:
"""停止监听"""
@@ -69,7 +69,7 @@ class InboxWatcher:
except asyncio.CancelledError:
pass
logger.info("Inbox watcher stopped (processed=%d, errors=%d)",
self._total_processed, self._total_errors)
self._total_processed, self._total_errors)
@property
def is_running(self) -> bool:
@@ -160,7 +160,8 @@ class InboxWatcher:
line_no, type(event).__name__)
self._total_errors += 1
except json.JSONDecodeError:
logger.warning("Inbox line %d: invalid JSON, skipping", line_no)
logger.warning(
"Inbox line %d: invalid JSON, skipping", line_no)
self._total_errors += 1
return events
+14 -5
View File
@@ -50,7 +50,9 @@ def notify_mail_failed(db_path: Path, original_mail_id: str,
bb = Blackboard(db_path)
original = bb.get_task(original_mail_id)
if not original:
logger.warning("notify_mail_failed: original mail %s not found", original_mail_id)
logger.warning(
"notify_mail_failed: original mail %s not found",
original_mail_id)
return
# 解析原邮件元数据
@@ -58,7 +60,9 @@ def notify_mail_failed(db_path: Path, original_mail_id: str,
# 防递归:系统通知邮件失败不再发通知
if meta.get("system_notify"):
logger.info("Mail %s: system notify mail failed, skipping recursive notification", original_mail_id)
logger.info(
"Mail %s: system notify mail failed, skipping recursive notification",
original_mail_id)
return
# 获取发件人(优先 assigned_byfallback must_haves.from
@@ -67,7 +71,9 @@ def notify_mail_failed(db_path: Path, original_mail_id: str,
title = original.title or ""
if not from_agent:
logger.warning("notify_mail_failed: cannot determine sender for mail %s", original_mail_id)
logger.warning(
"notify_mail_failed: cannot determine sender for mail %s",
original_mail_id)
return
# 发件人不是有效 Agent(如 system)→ 通知庞统代处理,不触发广播
@@ -108,7 +114,10 @@ def notify_mail_failed(db_path: Path, original_mail_id: str,
)
bb.create_task(notify_task)
logger.info("Mail %s: sent failure notification to %s (original_sender=%s, reason=%s, notify_id=%s)",
original_mail_id, target_agent, from_agent, reason, notify_id)
original_mail_id, target_agent, from_agent, reason, notify_id)
except Exception as e:
logger.warning("notify_mail_failed: failed to send notification for mail %s: %s", original_mail_id, e)
logger.warning(
"notify_mail_failed: failed to send notification for mail %s: %s",
original_mail_id,
e)
+15 -11
View File
@@ -8,15 +8,12 @@ from __future__ import annotations
import json
import logging
import re
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional
from src.blackboard.models import Task
from src.blackboard.operations import Blackboard
from src.blackboard.queries import Queries
logger = logging.getLogger("moziplus-v2.review")
@@ -151,12 +148,14 @@ class ReviewPipeline:
) -> ReviewResult:
"""Step 2: 格式合规"""
if not outputs:
return ReviewResult("format", ReviewVerdict.FAIL, 0.0, "No outputs")
return ReviewResult(
"format", ReviewVerdict.FAIL, 0.0, "No outputs")
issues = []
for out in outputs:
# output.md 必须存在且非空
if out.get("type") == "markdown" or out.get("path", "").endswith(".md"):
if out.get("type") == "markdown" or out.get(
"path", "").endswith(".md"):
content = out.get("content", "")
if not content and out.get("path"):
try:
@@ -167,7 +166,8 @@ class ReviewPipeline:
issues.append(f"Output too short: {out.get('path', '?')}")
# 结论 JSON 必须有效
if out.get("type") == "json" or out.get("path", "").endswith(".json"):
if out.get("type") == "json" or out.get(
"path", "").endswith(".json"):
content = out.get("content", "")
if not content and out.get("path"):
try:
@@ -177,7 +177,8 @@ class ReviewPipeline:
try:
data = json.loads(content)
if not isinstance(data, dict):
issues.append(f"JSON not a dict: {out.get('path', '?')}")
issues.append(
f"JSON not a dict: {out.get('path', '?')}")
except (json.JSONDecodeError, TypeError):
issues.append(f"Invalid JSON: {out.get('path', '?')}")
@@ -194,7 +195,8 @@ class ReviewPipeline:
) -> ReviewResult:
"""Step 3: 内容质量(自定义检查)"""
if not outputs:
return ReviewResult("quality", ReviewVerdict.FAIL, 0.0, "No outputs")
return ReviewResult(
"quality", ReviewVerdict.FAIL, 0.0, "No outputs")
suggestions = []
total_score = 0.0
@@ -215,7 +217,8 @@ class ReviewPipeline:
avg = 1.0 # 无自定义检查默认通过
verdict = ReviewVerdict.PASS if avg >= 0.6 else ReviewVerdict.FAIL
return ReviewResult("quality", verdict, round(avg, 2), suggestions=suggestions)
return ReviewResult("quality", verdict, round(
avg, 2), suggestions=suggestions)
def _determine_gate(
self, task: Task, results: List[ReviewResult]
@@ -329,6 +332,7 @@ class RebuttalManager:
return 0
try:
observations = self.bb.get_observations(task_id=task_id)
return sum(1 for o in observations if "Rebuttal round" in (o.body or ""))
return sum(
1 for o in observations if "Rebuttal round" in (o.body or ""))
except Exception:
return 0
+11 -5
View File
@@ -107,7 +107,8 @@ class AgentRouter:
# ── 快速路径 2: retry → 原执行者 ──
if action_type == "retry":
current = task_info.get("current_agent") or task_info.get("assignee")
current = task_info.get(
"current_agent") or task_info.get("assignee")
if current and current in self.agent_profiles:
return RouteDecision(
agent_id=current,
@@ -119,7 +120,8 @@ class AgentRouter:
# ── Mode B: Agent 声明式交接 ──
next_cap = task_info.get("next_capability")
if next_cap and self._validate_capability(next_cap):
current = task_info.get("current_agent") or task_info.get("assignee")
current = task_info.get(
"current_agent") or task_info.get("assignee")
exclude = {current} if current else set()
matched = self._match_capability(next_cap, exclude)
if matched:
@@ -129,7 +131,9 @@ class AgentRouter:
mode="agent_handoff",
latency_ms=int((time.monotonic() - start) * 1000),
)
logger.info("next_capability '%s' no match, delegate to coordinator", next_cap)
logger.info(
"next_capability '%s' no match, delegate to coordinator",
next_cap)
# ── 快速路径 3: 生命周期流转查表 ──
lifecycle = self.LIFECYCLE_CAPABILITY.get(action_type)
@@ -140,7 +144,8 @@ class AgentRouter:
exclude_current = lifecycle.get("exclude_current", False)
exclude = set()
if exclude_current:
current = task_info.get("current_agent") or task_info.get("assignee")
current = task_info.get(
"current_agent") or task_info.get("assignee")
if current:
exclude.add(current)
matched = self._match_capability(cap, exclude)
@@ -154,7 +159,8 @@ class AgentRouter:
# ── 快速路径 4: 有 assignee 且非生命周期流转 ──
assignee = task_info.get("assignee")
if assignee and assignee in self.agent_profiles and action_type not in ("review", "escalation"):
if assignee and assignee in self.agent_profiles and action_type not in (
"review", "escalation"):
return RouteDecision(
agent_id=assignee,
reason=f"Direct assignee: {assignee}",
+1 -2
View File
@@ -10,12 +10,11 @@ from __future__ import annotations
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger("moziplus-v2.skill")
+168 -75
View File
@@ -15,7 +15,7 @@ from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.blackboard.db import get_connection, init_db
from src.blackboard.db import get_connection
logger = logging.getLogger("moziplus-v2.spawner")
@@ -163,9 +163,12 @@ class AgentBusyError(Exception):
#07: reason 字段区分具体原因,便于 dispatcher 层区分处理。
"""
def __init__(self, agent_id: str, reason: str = "busy", detail: Optional[dict] = None):
def __init__(self, agent_id: str, reason: str = "busy",
detail: Optional[dict] = None):
self.agent_id = agent_id
self.reason = reason # counter_blocked / session_locked / session_running / session_compacting / session_stuck
# counter_blocked / session_locked / session_running / session_compacting / session_stuck
self.reason = reason
self.detail = detail or {}
super().__init__(f"{agent_id}: {reason}")
@@ -277,11 +280,15 @@ class AgentSpawner:
# mail 任务用精简模板
if project_id == "_mail":
return self._build_mail_prompt(task_id, title, description, must_haves, agent_id)
return self._build_mail_prompt(
task_id, title, description, must_haves, agent_id)
# 走 BootstrapBuilder 新路径
if self.bootstrap_builder and task is not None:
role_map = {"executor": "executor", "review": "reviewer", "discussion": "planner"}
role_map = {
"executor": "executor",
"review": "reviewer",
"discussion": "planner"}
role = role_map.get(spawn_type, "executor")
bootstrap_prompt = self.bootstrap_builder.build_for_task(
task=task,
@@ -293,13 +300,14 @@ class AgentSpawner:
# 无 BootstrapBuilder 或无 task 对象 → 最小 fallback
# 只保留任务上下文 + API 操作指令
logger.warning("No BootstrapBuilder or task object, using minimal fallback")
logger.warning(
"No BootstrapBuilder or task object, using minimal fallback")
return self._build_minimal_fallback(
task_id, title, description, must_haves,
project_id, agent_id)
def _build_minimal_fallback(self, task_id, title, description, must_haves,
project_id, agent_id):
project_id, agent_id):
"""最小 fallback:只有任务上下文 + API 指令"""
task_section = f"""## 任务
{title}
@@ -311,7 +319,7 @@ class AgentSpawner:
return task_section + "\n\n---\n\n" + api_section
def _build_api_section(self, project_id: str, task_id: str,
agent_id: str) -> str:
agent_id: str) -> str:
"""构建 API 回写操作指令(BootstrapBuilder 模式下补充)"""
# mail 任务直接 done,不走 review
success_status = '"done"' if project_id == "_mail" else '"review"'
@@ -337,8 +345,8 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
"""
def _build_discussion_prompt(self, task_id: str, title: str,
description: str, must_haves: str,
project_id: str, agent_id: str) -> str:
description: str, must_haves: str,
project_id: str, agent_id: str) -> str:
"""构建讨论类 spawn prompt(§3.3 框架 + Boids)"""
goal_snapshot = description or title
constraints = must_haves or "(无特殊约束)"
@@ -368,7 +376,8 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
if not self.guardrails:
return "无特殊限制"
try:
return "".join(r.get("name", r.get("rule_id", "")) for r in self.guardrails.rules[:6])
return "".join(r.get("name", r.get("rule_id", ""))
for r in self.guardrails.rules[:6])
except Exception:
return "无特殊限制"
@@ -379,9 +388,8 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
return router.agent_profiles.get(agent_id)
return None
def _build_mail_prompt(self, task_id: str, title: str, description: str,
must_haves: str, agent_id: str) -> str:
must_haves: str, agent_id: str) -> str:
"""构建 Mail 专用精简模板"""
# 解析 must_haves 获取 from 和 performative
from_agent = agent_id
@@ -389,7 +397,9 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
try:
meta = json.loads(must_haves) if must_haves else {}
from_agent = meta.get("from", agent_id)
performative = meta.get("performative", meta.get("type", "request"))
performative = meta.get(
"performative", meta.get(
"type", "request"))
except Exception:
pass
@@ -472,7 +482,9 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
self._revive_session(agent_id)
elif pre_state.get("status") == "running" and not pre_state.get("lock_pid_alive"):
# status=running 但 lock PID 已死 → 假死,revive
logger.warning("Phase 0: %s status=running but lock PID dead, reviving", agent_id)
logger.warning(
"Phase 0: %s status=running but lock PID dead, reviving",
agent_id)
self._revive_session(agent_id)
# Phase 1: Counter acquire(互斥锁)
@@ -487,12 +499,15 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
if use_main_session:
session_state = self._check_session_state(agent_id)
logger.info("Phase 2 session check for %s: status=%s lock_pid=%s lock_pid_alive=%s compact=%s",
agent_id, session_state.get('status'), session_state.get('lock_pid'),
agent_id, session_state.get(
'status'), session_state.get('lock_pid'),
session_state.get('lock_pid_alive'), session_state.get('recent_compact'))
blockers = []
if session_state.get("lock_pid_alive") and not session_state.get("lock_expired"):
blockers.append(("session_locked", session_state.get("lock_pid")))
if session_state.get(
"lock_pid_alive") and not session_state.get("lock_expired"):
blockers.append(
("session_locked", session_state.get("lock_pid")))
if session_state.get("status") == "running":
if session_state.get("lock_pid_alive"):
# 真 running:外部进程占用
@@ -515,7 +530,8 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
# Phase 2.5: 假死修复(status=running + lock PID 死 → revive → 重检)
# 此场景应被 Phase 0 提前修复,这里做兜底
if session_state.get("status") == "running" and not session_state.get("lock_pid_alive"):
if session_state.get("status") == "running" and not session_state.get(
"lock_pid_alive"):
logger.warning("Phase 2.5: %s status=running + lock dead (should be caught in Phase 0), reviving",
agent_id)
self._revive_session(agent_id)
@@ -538,7 +554,10 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
raise
if self.dry_run:
logger.info("[DRY RUN] Would spawn agent %s (session=%s)", agent_id, _sid_key)
logger.info(
"[DRY RUN] Would spawn agent %s (session=%s)",
agent_id,
_sid_key)
self._register_session(_sid_key, agent_id, task_id, pid=None)
return _sid_key
@@ -554,7 +573,8 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
if asyncio.iscoroutine(result):
await result
except Exception:
logger.warning("Business on_complete failed for %s", aid, exc_info=True)
logger.warning(
"Business on_complete failed for %s", aid, exc_info=True)
cmd = [
"openclaw", "agent",
@@ -575,7 +595,7 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
stderr=asyncio.subprocess.PIPE,
)
self._register_session(session_id, agent_id, task_id, proc.pid,
broadcast_task_ids=broadcast_task_ids)
broadcast_task_ids=broadcast_task_ids)
logger.info("Spawned agent %s (session=%s, pid=%d)",
agent_id, session_id, proc.pid)
@@ -593,7 +613,11 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
if self.counter:
self.counter.release(agent_id, _sid_key)
logger.exception("Failed to spawn agent %s", agent_id)
self._record_attempt(task_id, agent_id, "spawn_failed", error=str(e))
self._record_attempt(
task_id,
agent_id,
"spawn_failed",
error=str(e))
raise
async def spawn_subagent(
@@ -609,7 +633,9 @@ curl -X POST http://{self.api_host}:{self.api_port}/api/projects/{project_id}/ta
session_id = str(uuid.uuid4())
if self.dry_run:
logger.info("[DRY RUN] Would spawn subagent (session=%s)", session_id)
logger.info(
"[DRY RUN] Would spawn subagent (session=%s)",
session_id)
self._register_session(session_id, "subagent", task_id, pid=None)
return session_id
@@ -729,10 +755,16 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
agent_id, session_id, json_result)
# 查任务实际状态
task_status = self._get_task_status(db_path, task_id) if task_id else None
task_status = self._get_task_status(
db_path, task_id) if task_id else None
# 分类
cls = self._classify_outcome(exit_code, json_result, stderr_text, task_status, stdout_text)
cls = self._classify_outcome(
exit_code,
json_result,
stderr_text,
task_status,
stdout_text)
outcome = cls["outcome"]
# 更新 session 状态
@@ -761,17 +793,21 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
agent_id, session_id, outcome, exit_code, task_status)
# 广播反馈追踪(Phase 1 bug fix)
if task_id == "broadcast" and hasattr(self, '_ticker') and self._ticker:
if task_id == "broadcast" and hasattr(
self, '_ticker') and self._ticker:
# 广播任务:从 session 信息取真实 task_id 列表,逐一回调 tracker
sess_info = self._sessions.get(session_id or "main", {})
bt_ids = sess_info.get("broadcast_task_ids") or []
# 广播场景一律标 no_reply:Agent 只 claim 一个任务,
# 其余任务的 tracker 不能被 claimed 清除
for real_task_id in bt_ids:
self._ticker.record_broadcast_response(real_task_id, agent_id, "no_reply")
self._ticker.record_broadcast_response(
real_task_id, agent_id, "no_reply")
elif task_id and hasattr(self, '_ticker') and self._ticker:
outcome_str = "claimed" if cls.get("status") == "ok" else "no_reply"
self._ticker.record_broadcast_response(task_id, agent_id, outcome_str)
outcome_str = "claimed" if cls.get(
"status") == "ok" else "no_reply"
self._ticker.record_broadcast_response(
task_id, agent_id, outcome_str)
if cls["should_retry"]:
# cooldown: 新增的可恢复场景(A14/A15/A16/A8/A10)
@@ -850,14 +886,24 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
# v2.8.1 Fix-3a: crash 类 outcome 设 cooldown,给 agent session 恢复时间
if outcome == "crashed" and self.counter:
self.counter.set_cooldown(agent_id, seconds=60)
logger.info("Crash cooldown set for %s: 60s (outcome=%s)", agent_id, outcome)
logger.info(
"Crash cooldown set for %s: 60s (outcome=%s)",
agent_id,
outcome)
elif outcome in ("compact_failed", "process_crash", "session_stuck",
"compact_hanging", "agent_error", "compact_interrupted") and self.counter:
"compact_hanging", "agent_error", "compact_interrupted") and self.counter:
self.counter.set_cooldown(agent_id, seconds=300) # 5 分钟
logger.info("Error cooldown set for %s: 300s (outcome=%s)", agent_id, outcome)
logger.info(
"Error cooldown set for %s: 300s (outcome=%s)",
agent_id,
outcome)
# F1: 不可恢复 outcome → 立刻标 failed + 写黑板
if outcome in ("auth_failed", "agent_error") and db_path and task_id:
logger.error("Task %s: unrecoverable outcome=%s, marking failed immediately", task_id, outcome)
if outcome in ("auth_failed",
"agent_error") and db_path and task_id:
logger.error(
"Task %s: unrecoverable outcome=%s, marking failed immediately",
task_id,
outcome)
self._mark_task(db_path, task_id, "failed", {
"reason": outcome,
"stderr_preview": (stderr_text or "")[:500],
@@ -881,13 +927,16 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
except Exception:
pass
stderr_text = b"".join(stderr_chunks).decode("utf-8", errors="replace")
# stderr collected but not used in this handler
# (kept for potential future diagnostics)
b"".join(stderr_chunks).decode("utf-8", errors="replace")
# 检查 session 状态
state = self._check_session_state(agent_id)
# B1: 假死 - 先复活,连续假死 ≥2 次再 failed
if state.get("status") == "running" and not state.get("lock_pid_alive", True):
if state.get("status") == "running" and not state.get(
"lock_pid_alive", True):
# 假死计数
stuck_count = self._stuck_counts.get(task_id, 0) + 1
self._stuck_counts[task_id] = stuck_count
@@ -913,7 +962,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
await self._do_on_complete_async(on_complete, agent_id, "session_revived")
else:
# 复活失败 → 标 failed
logger.error("Agent %s revive failed, marking failed", agent_id)
logger.error(
"Agent %s revive failed, marking failed", agent_id)
self._mark_task(db_path, task_id, "failed",
{"reason": "revive_failed", "stuck_count": stuck_count,
"diagnostics": state})
@@ -994,7 +1044,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
"SELECT status FROM tasks WHERE id=?", (task_id,)
).fetchone()
# Bug-6 fix: pending 不是终态
if row and row["status"] in ("done", "failed", "cancelled", "review"):
if row and row["status"] in (
"done", "failed", "cancelled", "review"):
logger.info("Retry skip: task %s already %s (agent=%s)",
task_id, row["status"], agent_id)
# on_complete = wrapped_on_complete,会 release counter
@@ -1003,7 +1054,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
finally:
conn.close()
except Exception:
logger.warning("Retry status check failed for %s, proceeding", task_id)
logger.warning(
"Retry status check failed for %s, proceeding", task_id)
# 直接读写 tasks 表的 retry_count
if retry_field == "retry_count" and db_path and task_id:
@@ -1023,7 +1075,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
finally:
conn.close()
except Exception:
logger.exception("Failed to update retry_count for task %s", task_id)
logger.exception(
"Failed to update retry_count for task %s", task_id)
count = 1
else:
retry_counts = self._get_retry_counts(db_path, task_id)
@@ -1107,7 +1160,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
"""
text = stdout_text.strip()
if not text:
return {"status": None, "summary": None, "fallback_used": False, "fallback_reason": None, "payloads": []}
return {"status": None, "summary": None, "fallback_used": False,
"fallback_reason": None, "payloads": []}
try:
data = json.loads(text)
except json.JSONDecodeError:
@@ -1119,7 +1173,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
except json.JSONDecodeError:
continue
else:
return {"status": None, "summary": None, "fallback_used": False, "fallback_reason": None, "payloads": []}
return {"status": None, "summary": None, "fallback_used": False,
"fallback_reason": None, "payloads": []}
# 从 data.result.meta.executionTrace 取 fallback 信息
result = data.get("result", {})
@@ -1135,7 +1190,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
}
@staticmethod
def _get_task_status(db_path: Optional[Path], task_id: Optional[str]) -> Optional[str]:
def _get_task_status(
db_path: Optional[Path], task_id: Optional[str]) -> Optional[str]:
"""查任务实际 API 状态"""
if not db_path or not task_id:
return None
@@ -1152,7 +1208,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
return None
@staticmethod
def _get_task_info(db_path: Optional[Path], task_id: Optional[str]) -> Optional[dict]:
def _get_task_info(db_path: Optional[Path],
task_id: Optional[str]) -> Optional[dict]:
"""查任务基本信息"""
if not db_path or not task_id:
return None
@@ -1160,7 +1217,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
conn = get_connection(db_path)
try:
row = conn.execute(
"SELECT id, title, status FROM tasks WHERE id=?", (task_id,)
"SELECT id, title, status FROM tasks WHERE id=?", (
task_id,)
).fetchone()
if not row:
return None
@@ -1192,7 +1250,9 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
sessions[main_key] = main_session
with open(sessions_path, "w") as f:
json.dump(sessions, f, indent=2)
logger.info("Revived %s: sessions.json status changed running→idle", agent_id)
logger.info(
"Revived %s: sessions.json status changed running→idle",
agent_id)
# #07 O4: 同时清理残留 lock 文件
sf = main_session.get("sessionFile", "")
if sf:
@@ -1200,7 +1260,10 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
if lock_path.exists():
try:
lock_path.unlink()
logger.info("Cleaned stale lock for %s: %s", agent_id, lock_path.name)
logger.info(
"Cleaned stale lock for %s: %s",
agent_id,
lock_path.name)
except Exception:
pass
return True
@@ -1209,7 +1272,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
return False
@staticmethod
def _check_recent_compaction_jsonl(session_file: str, window_seconds: int = 900) -> bool:
def _check_recent_compaction_jsonl(
session_file: str, window_seconds: int = 900) -> bool:
"""v2.8.2 Fix-2: 读 session jsonl 末尾,检查是否有 window_seconds 内的 compaction 记录。
compactionCheckpoints 更可靠:Gateway 每次完成 compact 必然在 jsonl 末尾追加记录,
@@ -1219,7 +1283,7 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
实测 50KB 在长对话中不够compact 记录被推出窗口导致漏检
正常扫描量不变从尾部往前扫遇到超过 15min timestamp break
"""
if not session_file or not pathlib.Path(session_file).exists():
if not session_file or not Path(session_file).exists():
return False
try:
from datetime import datetime, timezone
@@ -1241,7 +1305,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
ts = obj.get("timestamp", "")
if ts:
try:
ct = datetime.fromisoformat(ts.replace("Z", "+00:00"))
ct = datetime.fromisoformat(
ts.replace("Z", "+00:00"))
if (now - ct).total_seconds() < window_seconds:
return True
except (ValueError, TypeError):
@@ -1265,7 +1330,11 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
v2.8.1: compact 检测改用 session jsonl 末尾扫描(Fix-1),
替代失效的 compactionCheckpoints 检测
"""
result = {"status": "unknown", "lock_pid": None, "lock_pid_alive": False, "recent_compact": False}
result = {
"status": "unknown",
"lock_pid": None,
"lock_pid_alive": False,
"recent_compact": False}
sessions_path = Path(os.environ.get(
"OPENCLAW_HOME", str(Path.home() / ".openclaw")
)) / "agents" / agent_id / "sessions" / "sessions.json"
@@ -1304,8 +1373,10 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
created_at_str = lock_data.get("createdAt", "")
if created_at_str:
from datetime import datetime as _dt, timezone as _tz
created_dt = _dt.fromisoformat(created_at_str.replace("Z", "+00:00"))
elapsed = (_dt.now(_tz.utc) - created_dt).total_seconds()
created_dt = _dt.fromisoformat(
created_at_str.replace("Z", "+00:00"))
elapsed = (_dt.now(_tz.utc) -
created_dt).total_seconds()
if elapsed > 1800: # 30 minutes
result["lock_pid_alive"] = False
result["lock_expired"] = True
@@ -1318,8 +1389,10 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
# v2.8.1 Fix-1: compact 检测改用 session jsonl 末尾扫描
# 只在 agent 非空闲时才扫描(减少不必要 I/O)
if result["status"] not in ("done", "idle", "unknown", None) and sf:
result["recent_compact"] = AgentSpawner._check_recent_compaction_jsonl(sf)
if result["status"] not in (
"done", "idle", "unknown", None) and sf:
result["recent_compact"] = AgentSpawner._check_recent_compaction_jsonl(
sf)
except Exception:
pass
return result
@@ -1364,14 +1437,17 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
# A15/A16: stderr 含 network/compact 关键字 → 可恢复
if stderr_text:
stderr_lower = stderr_text.lower()
if any(kw in stderr_lower for kw in ["econnrefused", "etimedout", "gateway closed", "econnreset"]):
if any(kw in stderr_lower for kw in [
"econnrefused", "etimedout", "gateway closed", "econnreset"]):
return {"outcome": "gateway_unreachable", "should_retry": True,
"retry_field": "retry_count", "cooldown_seconds": 60}
if any(kw in stderr_lower for kw in ["compaction-diag", "context-overflow"]):
if any(kw in stderr_lower for kw in [
"compaction-diag", "context-overflow"]):
return {"outcome": "compact_interrupted", "should_retry": True,
"retry_field": "retry_count", "cooldown_seconds": 60}
# A17: 真正的 crash → 保持 working,ticker 兜底
return {"outcome": "crashed", "should_retry": False, "original": "process_crash"}
return {"outcome": "crashed", "should_retry": False,
"original": "process_crash"}
# A13 revised: stdout 为空但 exit=0 → 信任进程退出码,视为正常完成
# 实测发现 openclaw session=None + exit=0 是正常场景(inform 通知等)
@@ -1382,25 +1458,32 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
# A7-A12: status=error → 不续杯,stderr 辅助分类
if status == "error":
stderr_lower = stderr_text.lower()
if any(kw in stderr_lower for kw in ["401", "403", "unauthorized", "auth"]):
if any(kw in stderr_lower for kw in [
"401", "403", "unauthorized", "auth"]):
return {"outcome": "auth_failed", "should_retry": False}
if any(kw in stderr_lower for kw in ["econnrefused", "etimedout", "gateway closed", "econnreset"]):
if any(kw in stderr_lower for kw in [
"econnrefused", "etimedout", "gateway closed", "econnreset"]):
return {"outcome": "gateway_unreachable", "should_retry": True,
"retry_field": "retry_count", "cooldown_seconds": 60}
if any(kw in stderr_lower for kw in ["rate_limit", "500", "503", "api error"]):
if any(kw in stderr_lower for kw in [
"rate_limit", "500", "503", "api error"]):
return {"outcome": "api_error", "should_retry": False}
if any(kw in stderr_lower for kw in ["compaction-diag", "context-overflow"]):
if any(kw in stderr_lower for kw in [
"compaction-diag", "context-overflow"]):
return {"outcome": "compact_failed", "should_retry": False}
if any(kw in stderr_lower for kw in ["lock", "busy", "concurrent", "lane task error"]):
if any(kw in stderr_lower for kw in [
"lock", "busy", "concurrent", "lane task error"]):
return {"outcome": "lock_conflict", "should_retry": True,
"retry_field": "retry_count", "cooldown_seconds": 60}
return {"outcome": "agent_error", "should_retry": False}
# 兜底:status 未知值
return {"outcome": "agent_error", "should_retry": False, "original": "unknown_status"}
return {"outcome": "agent_error",
"should_retry": False, "original": "unknown_status"}
@staticmethod
def _get_retry_counts(db_path: Optional[Path], task_id: Optional[str]) -> dict:
def _get_retry_counts(
db_path: Optional[Path], task_id: Optional[str]) -> dict:
"""从最新 task_attempt 的 metadata 读计数器"""
defaults = {"retry_count": 0, "connect_retry_count": 0,
"api_retry_count": 0, "lock_retry_count": 0,
@@ -1426,7 +1509,7 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
return defaults
def _update_retry_counts(self, db_path: Optional[Path],
task_id: Optional[str], counts: dict):
task_id: Optional[str], counts: dict):
"""将 retry counts 写回最新 task_attempt 的 metadata"""
if not db_path or not task_id:
return
@@ -1440,7 +1523,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
(task_id,)
).fetchone()
if row:
meta = json.loads(row["metadata"]) if row["metadata"] else {}
meta = json.loads(
row["metadata"]) if row["metadata"] else {}
meta.update(counts)
conn.execute(
"UPDATE task_attempts SET metadata=? WHERE rowid=?",
@@ -1450,7 +1534,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
finally:
conn.close()
except Exception:
logger.exception("Failed to update retry counts for task %s", task_id)
logger.exception(
"Failed to update retry counts for task %s", task_id)
def _mark_task(self, db_path: Optional[Path], task_id: Optional[str],
status: str, detail: Optional[dict] = None):
@@ -1468,7 +1553,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
if detail:
conn.execute(
"INSERT INTO events (task_id, agent, event_type, detail) VALUES (?,?,?,?)",
(task_id, "daemon", status, json.dumps(detail, ensure_ascii=False))
(task_id, "daemon", status, json.dumps(
detail, ensure_ascii=False))
)
conn.commit()
finally:
@@ -1486,10 +1572,13 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
from src.blackboard.operations import Blackboard
bb = Blackboard(db_path)
cid = bb.add_comment(task_id, "daemon",
f"@pangtong-fujunshi 任务执行失败: {reason},请评估是否需要介入",
comment_type="system")
f"@pangtong-fujunshi 任务执行失败: {reason},请评估是否需要介入",
comment_type="system")
bb.record_mentions(cid, task_id, ["pangtong-fujunshi"])
logger.info("Task %s: failure notified pangtong via comment+mention (reason=%s)", task_id, reason)
logger.info(
"Task %s: failure notified pangtong via comment+mention (reason=%s)",
task_id,
reason)
except Exception as e:
logger.warning("Task %s: failed to notify: %s", task_id, e)
except Exception:
@@ -1518,7 +1607,10 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
if asyncio.iscoroutine(result):
await result
except Exception:
logger.warning("on_complete callback failed for %s", agent_id, exc_info=True)
logger.warning(
"on_complete callback failed for %s",
agent_id,
exc_info=True)
def _register_session(
self,
@@ -1596,7 +1688,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_
def get_session_by_agent(self, agent_id: str) -> Optional[Dict[str, Any]]:
"""v2.7.2: 根据 agent_id 获取活跃 session 信息(用于进程存活性检查)"""
for sid, info in self._sessions.items():
if info.get("agent_id") == agent_id and info.get("status") == "running":
if info.get("agent_id") == agent_id and info.get(
"status") == "running":
return info
return None
+3 -5
View File
@@ -9,14 +9,11 @@ from __future__ import annotations
import asyncio
import json
import logging
import subprocess
import uuid
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set
from typing import Any, Dict, List, Optional
from src.blackboard.models import Event
logger = logging.getLogger("moziplus-v2.sse")
@@ -52,7 +49,8 @@ class SSEEvent:
"""格式化为 SSE 协议文本"""
lines = [f"id: {self.id}"]
lines.append(f"event: {self.event_type}")
lines.append(f"data: {json.dumps(self.data, ensure_ascii=False, default=str)}")
lines.append(
f"data: {json.dumps(self.data, ensure_ascii=False, default=str)}")
return "\n".join(lines) + "\n\n"
+199 -96
View File
@@ -21,7 +21,6 @@ from dataclasses import dataclass, field as dc_field
from src.blackboard.operations import Blackboard
from src.blackboard.db import get_connection
from src.blackboard.models import Task
from src.daemon.spawner import AgentBusyError
from src.blackboard.queries import Queries
from src.blackboard.registry import ProjectRegistry
@@ -32,9 +31,11 @@ class BroadcastRound:
"""追踪单个任务的广播状态"""
task_id: str
notified_agents: set = dc_field(default_factory=set) # 已 spawn 过的 Agent
responded_agents: set = dc_field(default_factory=set) # 已返回反馈的 Agent(含 NO_REPLY
responded_agents: set = dc_field(
default_factory=set) # 已返回反馈的 Agent(含 NO_REPLY
round_number: int = 0 # 当前第几轮(0=未开始,1=第1轮)
logger = logging.getLogger("moziplus-v2.ticker")
@@ -46,7 +47,8 @@ class Ticker:
registry: ProjectRegistry,
tick_interval: float = 30.0,
max_ticks: Optional[int] = None,
on_tick_complete: Optional[Callable[[], Coroutine[Any, Any, None]]] = None,
on_tick_complete: Optional[Callable[[],
Coroutine[Any, Any, None]]] = None,
dispatcher: Optional[Any] = None,
spawner: Optional[Any] = None,
max_dispatch_per_tick: int = 3,
@@ -194,7 +196,10 @@ class Ticker:
pr = await self._tick_project(project_id, project_info)
results["projects"][project_id] = pr
except Exception as e:
logger.exception("Tick %d project %s error", tick_num, project_id)
logger.exception(
"Tick %d project %s error",
tick_num,
project_id)
results["projects"][project_id] = {"error": str(e)}
# 虚拟项目 _general:不在 registry 但需要调度
@@ -223,7 +228,10 @@ class Ticker:
logger.exception("Tick %d _mail error", tick_num)
results["projects"]["_mail"] = {"error": str(e)}
logger.debug("Tick %d complete: %d projects", tick_num, len(active_projects))
logger.debug(
"Tick %d complete: %d projects",
tick_num,
len(active_projects))
if self.on_tick_complete:
try:
@@ -314,7 +322,8 @@ class Ticker:
# 8. 健康检查(僵尸检测)
if self.health_checker:
try:
self.health_checker.check(project_id, db_path, self._tick_count)
self.health_checker.check(
project_id, db_path, self._tick_count)
except Exception as e:
logger.warning("HealthChecker error for %s: %s", project_id, e)
@@ -335,7 +344,8 @@ class Ticker:
task_id=t.id, task_title=t.title, task_type=t.task_type
)
except Exception as e:
logger.warning("ExperienceDistiller error for %s: %s", project_id, e)
logger.warning(
"ExperienceDistiller error for %s: %s", project_id, e)
# 10. 扫描后状态
result["summary_after"] = queries.task_summary()
@@ -375,7 +385,8 @@ class Ticker:
(computed, pid),
)
refreshed.append(pid)
logger.info("Parent %s status aggregated: → %s", pid, computed)
logger.info(
"Parent %s status aggregated: → %s", pid, computed)
if refreshed:
conn.commit()
@@ -391,7 +402,7 @@ class Ticker:
MAX_ROUNDS = 5 # §4.5 防无限循环
async def _check_round_complete(self, db_path: Path,
project_id: str) -> List[str]:
project_id: str) -> List[str]:
"""检测 parent task 下所有 sub task 终态 → spawn 庞统 review
流程§4.4
@@ -462,7 +473,7 @@ class Ticker:
"Round %d review spawned for parent %s (subs: %s)",
new_round, parent_id, summary
)
except Exception as e:
except Exception:
logger.exception("Round check error for parent %s", parent_id)
return reviewed
@@ -531,9 +542,9 @@ Parent Task ID: {parent_task.id}
"""
async def _spawn_pangtong_review(self, parent_task,
review_prompt: str,
project_id: str,
new_round: int = 0) -> bool:
review_prompt: str,
project_id: str,
new_round: int = 0) -> bool:
"""Spawn 庞统进行 review
流程
@@ -543,7 +554,7 @@ Parent Task ID: {parent_task.id}
"""
try:
agent_id = "pangtong-fujunshi"
session_id = f"review-{parent_task.id}-r{new_round}"
f"review-{parent_task.id}-r{new_round}"
# 构造 on_complete 回调:解析庞统结论,更新 parent 状态
async def _on_review_complete(aid: str, outcome: str):
@@ -555,7 +566,8 @@ Parent Task ID: {parent_task.id}
latest_meta = None
latest_time = ""
for sid, sess in self.spawner._sessions.items():
if sess.get("agent_id") == agent_id and sess.get("meta"):
if sess.get(
"agent_id") == agent_id and sess.get("meta"):
t = sess.get("completed_at", "")
if t > latest_time:
latest_time = t
@@ -586,8 +598,10 @@ Parent Task ID: {parent_task.id}
self._set_parent_reviewing(parent_task.id, project_id)
return True
return False
except Exception as e:
logger.exception("Failed to spawn pangtong review for %s", parent_task.id)
except Exception:
logger.exception(
"Failed to spawn pangtong review for %s",
parent_task.id)
return False
def _set_parent_reviewing(self, parent_id: str, project_id: str):
@@ -603,14 +617,14 @@ Parent Task ID: {parent_task.id}
(parent_id,))
conn.commit()
logger.info("Parent %s → reviewing (round review in progress)",
parent_id)
parent_id)
finally:
conn.close()
except Exception:
logger.exception("Failed to set parent %s to reviewing", parent_id)
def _handle_review_conclusion(self, parent_id: str, project_id: str,
review_text: str, round_num: int):
review_text: str, round_num: int):
"""解析庞统 review 结论,更新 parent 状态
review_text 是庞统回复的文本 spawner session meta payloads 拼接
@@ -619,7 +633,8 @@ Parent Task ID: {parent_task.id}
conn = get_connection(db_path)
try:
# 解析 GOAL_ACHIEVED
is_achieved = bool(review_text and "GOAL_ACHIEVED" in review_text.upper())
is_achieved = bool(
review_text and "GOAL_ACHIEVED" in review_text.upper())
if is_achieved:
# Goal 达成 → parent 最终完成
@@ -649,7 +664,9 @@ Parent Task ID: {parent_task.id}
"(round %d, subs=%d)",
parent_id, round_num, sub_count)
except Exception:
logger.exception("Failed to handle review conclusion for %s", parent_id)
logger.exception(
"Failed to handle review conclusion for %s",
parent_id)
# 安全恢复:reviewing → working
try:
conn.execute("BEGIN IMMEDIATE")
@@ -675,7 +692,7 @@ Parent Task ID: {parent_task.id}
MENTION_MAX_RETRIES = 5
async def _process_mentions(self, db_path: Path,
project_id: str) -> List[str]:
project_id: str) -> List[str]:
"""扫描 pending mentions → spawn 被 @ 的 Agent
流程§3.4
@@ -687,7 +704,8 @@ Parent Task ID: {parent_task.id}
return []
bb = Blackboard(db_path)
mentions = bb.get_pending_mentions(max_retries=self.MENTION_MAX_RETRIES)
mentions = bb.get_pending_mentions(
max_retries=self.MENTION_MAX_RETRIES)
if not mentions:
return []
@@ -751,27 +769,32 @@ Parent Task ID: {parent_task.id}
if new_review and new_review["verdict"] == "approved":
_ticker._transition_status(
get_connection(rdb_path), _t_id, "done",
get_connection(
rdb_path), _t_id, "done",
agent="daemon",
detail={"reason": "rebuttal_approved"})
logger.info("Rebuttal: task %s approved after rebuttal", _t_id)
logger.info(
"Rebuttal: task %s approved after rebuttal", _t_id)
else:
# 仍非 approved → @mention assignee
verdict_str = new_review["verdict"] if new_review else "未知"
rconn2 = get_connection(rdb_path)
try:
t_row = rconn2.execute("SELECT assignee FROM tasks WHERE id=?", (_t_id,)).fetchone()
t_row = rconn2.execute(
"SELECT assignee FROM tasks WHERE id=?", (_t_id,)).fetchone()
finally:
rconn2.close()
if t_row and t_row["assignee"]:
from src.blackboard.blackboard import Blackboard
bb2 = Blackboard(rdb_path)
bb2.add_comment(_t_id, "daemon",
f"@{t_row['assignee']} 审查结论: {verdict_str},请查看详情并决定接受或反驳",
comment_type="review")
logger.info("Rebuttal: task %s still %s after rebuttal", _t_id, verdict_str)
f"@{t_row['assignee']} 审查结论: {verdict_str},请查看详情并决定接受或反驳",
comment_type="review")
logger.info(
"Rebuttal: task %s still %s after rebuttal", _t_id, verdict_str)
except Exception:
logger.exception("Rebuttal on_complete failed for task %s", _t_id)
logger.exception(
"Rebuttal on_complete failed for task %s", _t_id)
result = await self.spawner.spawn_full_agent(
agent_id=agent_id,
@@ -794,22 +817,30 @@ Parent Task ID: {parent_task.id}
for item in items:
bb.mark_mention_notified(item["id"])
processed.append(agent_id)
logger.info("Mention spawn success: %s (%d mentions)", agent_id, len(items))
logger.info(
"Mention spawn success: %s (%d mentions)",
agent_id,
len(items))
else:
# spawn 返回 None(其他原因)→ 递增 retry_count
for item in items:
bb.mark_mention_retry(item["id"])
logger.warning("Mention spawn failed: %s, retrying next tick", agent_id)
logger.warning(
"Mention spawn failed: %s, retrying next tick", agent_id)
except AgentBusyError:
# Agent 忙,不递增 retry_count,等下次 tick 自然重试
logger.info("Mention spawn skipped: %s busy, will retry next tick", agent_id)
logger.info(
"Mention spawn skipped: %s busy, will retry next tick",
agent_id)
except Exception as e:
logger.exception("Mention processing error for agent %s", agent_id)
except Exception:
logger.exception(
"Mention processing error for agent %s", agent_id)
for item in items:
try:
if item.get("retry_count", 0) >= self.MENTION_MAX_RETRIES - 1:
if item.get("retry_count",
0) >= self.MENTION_MAX_RETRIES - 1:
bb.mark_mention_failed(item["id"])
else:
bb.mark_mention_retry(item["id"])
@@ -822,8 +853,14 @@ Parent Task ID: {parent_task.id}
mention_lines: List[str],
project_id: str) -> str:
"""#03: @mention prompt(身份注入)"""
api_host = getattr(self.spawner, 'api_host', '127.0.0.1') if self.spawner else '127.0.0.1'
api_port = getattr(self.spawner, 'api_port', 8083) if self.spawner else 8083
api_host = getattr(
self.spawner,
'api_host',
'127.0.0.1') if self.spawner else '127.0.0.1'
api_port = getattr(
self.spawner,
'api_port',
8083) if self.spawner else 8083
api_base = f"http://{api_host}:{api_port}/api"
# 获取 Agent 专长
@@ -899,7 +936,8 @@ Parent Task ID: {parent_task.id}
from datetime import datetime
conn.execute("BEGIN IMMEDIATE")
row = conn.execute("SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
row = conn.execute(
"SELECT status FROM tasks WHERE id=?", (task_id,)).fetchone()
if not row:
return False
old_status = row["status"]
@@ -938,7 +976,8 @@ Parent Task ID: {parent_task.id}
event_type = "daemon_tick"
conn.execute(
"INSERT INTO events (task_id, agent, event_type, detail) VALUES (?,?,?,?)",
(task_id, agent, event_type, json.dumps({"from": old_status, "to": new_status, **(detail or {})})),
(task_id, agent, event_type, json.dumps(
{"from": old_status, "to": new_status, **(detail or {})})),
)
conn.commit()
return True
@@ -948,7 +987,7 @@ Parent Task ID: {parent_task.id}
# ------------------------------------------------------------------
async def _dispatch_pending(self, db_path: Path,
project_id: str) -> List[str]:
project_id: str) -> List[str]:
"""扫描 pending 任务并调度
v3.0: 两条路径
@@ -978,9 +1017,12 @@ Parent Task ID: {parent_task.id}
try:
result = await self.dispatcher.dispatch(
task,
project_config={"project_id": project_id, "db_path": db_path},
project_config={
"project_id": project_id,
"db_path": db_path},
)
if result["status"] == "dispatched" and result["level"] in ("full", "escalate"):
if result["status"] == "dispatched" and result["level"] in (
"full", "escalate"):
conn = get_connection(db_path)
try:
# [v2.7.1] Mail 已在 dispatcher 中标 working,跳过 claimed
@@ -1073,7 +1115,8 @@ Parent Task ID: {parent_task.id}
detail={"reason": "no_taker_after_3_broadcasts",
"round_number": self._broadcast_tracker.get(t.id).round_number if self._broadcast_tracker.get(t.id) else 0},
)
logger.warning("Escalated %s: no taker after 3 broadcast rounds", t.id)
logger.warning(
"Escalated %s: no taker after 3 broadcast rounds", t.id)
self._broadcast_tracker.pop(t.id, None)
finally:
conn.close()
@@ -1083,7 +1126,8 @@ Parent Task ID: {parent_task.id}
idle_agents = self._get_idle_agents()
if not idle_agents:
logger.warning("No idle agents for broadcast, skipping (capacity issue)")
logger.warning(
"No idle agents for broadcast, skipping (capacity issue)")
return []
task_ids = [t.id for t in broadcastable]
@@ -1114,7 +1158,8 @@ Parent Task ID: {parent_task.id}
spawned = []
for agent_id in idle_agents:
prompt = self._build_claim_prompt(agent_id, broadcastable, project_id)
prompt = self._build_claim_prompt(
agent_id, broadcastable, project_id)
try:
session_id = await self.spawner.spawn_full_agent(
agent_id=agent_id,
@@ -1128,7 +1173,8 @@ Parent Task ID: {parent_task.id}
spawned.append(session_id)
# 记录已通知的 Agent
for t in broadcastable:
self._broadcast_tracker[t.id].notified_agents.add(agent_id)
self._broadcast_tracker[t.id].notified_agents.add(
agent_id)
except AgentBusyError:
logger.debug("Broadcast skip %s: busy", agent_id)
except Exception:
@@ -1139,8 +1185,14 @@ Parent Task ID: {parent_task.id}
def _build_claim_prompt(self, agent_id: str, tasks: list,
project_id: str) -> str:
"""#03: 广播认领 prompt(身份+专长注入)"""
api_host = getattr(self.spawner, 'api_host', '127.0.0.1') if self.spawner else '127.0.0.1'
api_port = getattr(self.spawner, 'api_port', 8083) if self.spawner else 8083
api_host = getattr(
self.spawner,
'api_host',
'127.0.0.1') if self.spawner else '127.0.0.1'
api_port = getattr(
self.spawner,
'api_port',
8083) if self.spawner else 8083
api_base = f"http://{api_host}:{api_port}/api"
# 获取 Agent 专长
@@ -1195,7 +1247,8 @@ Parent Task ID: {parent_task.id}
@property
def counter(self):
"""从 Dispatcher 获取 counter"""
return getattr(self.dispatcher, 'counter', None) if self.dispatcher else None
return getattr(self.dispatcher, 'counter',
None) if self.dispatcher else None
@staticmethod
def _is_pid_alive(pid: int) -> bool:
@@ -1207,7 +1260,8 @@ Parent Task ID: {parent_task.id}
except (ProcessLookupError, PermissionError):
return False
def record_broadcast_response(self, task_id: str, agent_id: str, outcome: str):
def record_broadcast_response(
self, task_id: str, agent_id: str, outcome: str):
"""记录 Agent 对广播任务的反馈(Spawner 调用的公共 API"""
tracker = self._broadcast_tracker.get(task_id)
if not tracker:
@@ -1228,7 +1282,8 @@ Parent Task ID: {parent_task.id}
def _get_all_agent_ids(self) -> List[str]:
"""获取所有配置的 Agent ID"""
if self.dispatcher and hasattr(self.dispatcher, 'router') and self.dispatcher.router:
if self.dispatcher and hasattr(
self.dispatcher, 'router') and self.dispatcher.router:
return list(self.dispatcher.router.agent_profiles.keys())
return []
@@ -1237,12 +1292,13 @@ Parent Task ID: {parent_task.id}
if not self.counter:
return []
# agent_profiles 在 Router 初始化时从 config 填充,是完整 Agent 列表
all_agents = list(self.dispatcher.router.agent_profiles.keys()) if self.dispatcher else []
all_agents = list(
self.dispatcher.router.agent_profiles.keys()) if self.dispatcher else []
active = self.counter.active_agents
return [aid for aid in all_agents if active.get(aid, 0) == 0]
async def _dispatch_reviews(self, db_path: Path,
project_id: str) -> List[str]:
project_id: str) -> List[str]:
"""扫描 review 状态任务,检查是否有产出,调度审查 Agent"""
# mail 任务不走 review 流程,直接跳过
if project_id == "_mail":
@@ -1291,7 +1347,9 @@ Parent Task ID: {parent_task.id}
result = await self.dispatcher.dispatch(
task,
action_type="review",
project_config={"project_id": project_id, "db_path": db_path},
project_config={
"project_id": project_id,
"db_path": db_path},
)
if result["status"] == "dispatched":
dispatched.append(task.id)
@@ -1344,7 +1402,7 @@ Parent Task ID: {parent_task.id}
)
reclaimed.append(task.id)
logger.warning("Escalated %s: no taker after %d broadcasts",
task.id, retry_count)
task.id, retry_count)
finally:
conn.close()
else:
@@ -1375,8 +1433,10 @@ Parent Task ID: {parent_task.id}
working = queries.tasks_by_status("working")
for task in working:
# #07.2: crash_limit 统一检查(比超时更严重的信号)
if self.dispatcher and hasattr(self.dispatcher, '_check_crash_limit'):
if self.dispatcher._check_crash_limit(task.id, db_path, limit=3, window_minutes=30):
if self.dispatcher and hasattr(
self.dispatcher, '_check_crash_limit'):
if self.dispatcher._check_crash_limit(
task.id, db_path, limit=3, window_minutes=30):
conn = get_connection(db_path)
try:
self._transition_status(
@@ -1388,7 +1448,8 @@ Parent Task ID: {parent_task.id}
finally:
conn.close()
reclaimed.append(task.id)
logger.error("Task %s: executor crash limit (3/30m), marking failed", task.id)
logger.error(
"Task %s: executor crash limit (3/30m), marking failed", task.id)
continue
# #07.3 ACT-1: updated_at fallback 覆盖 mail auto-working(无 started_at/claimed_at
@@ -1400,7 +1461,8 @@ Parent Task ID: {parent_task.id}
# per-task timeout: deadline 优先,否则用默认值
if task.deadline:
deadline_time = datetime.fromisoformat(task.deadline)
timeout_minutes = (deadline_time - start_time).total_seconds() / 60.0
timeout_minutes = (
deadline_time - start_time).total_seconds() / 60.0
if timeout_minutes < 1:
timeout_minutes = self.default_task_timeout_minutes
else:
@@ -1423,7 +1485,7 @@ Parent Task ID: {parent_task.id}
if ok:
reclaimed.append(task.id)
logger.info("Mail %s: ticker recheck found reply, marked done (%.1fm)",
task.id, elapsed)
task.id, elapsed)
finally:
conn.close()
continue
@@ -1440,15 +1502,17 @@ Parent Task ID: {parent_task.id}
if ok:
reclaimed.append(task.id)
logger.warning("Task %s timed out (working %.1fm > %.1fm)",
task.id, elapsed, timeout_minutes)
task.id, elapsed, timeout_minutes)
finally:
conn.close()
except (ValueError, TypeError):
pass
# v2.7.2: 进程存活性检查 — counter 占用但进程已死的兜底
if self.spawner and self.counter and hasattr(self.counter, "active_agents"):
for agent_id in list(self.counter.active_agents.keys()) if hasattr(self.counter, "active_agents") else []:
if self.spawner and self.counter and hasattr(
self.counter, "active_agents"):
for agent_id in list(self.counter.active_agents.keys()) if hasattr(
self.counter, "active_agents") else []:
session_info = self.spawner.get_session_by_agent(agent_id)
if not session_info:
continue
@@ -1465,20 +1529,24 @@ Parent Task ID: {parent_task.id}
conn = get_connection(db_path)
try:
current_row = conn.execute(
"SELECT status FROM tasks WHERE id=?", (task_id_check,)
"SELECT status FROM tasks WHERE id=?", (
task_id_check,)
).fetchone()
if current_row and current_row["status"] == "review":
logger.info("Task %s in review, keeping status (process dead)", task_id_check)
logger.info(
"Task %s in review, keeping status (process dead)", task_id_check)
else:
self._transition_status(
conn, task_id_check, "pending",
agent="daemon",
detail={"reason": "process_dead", "pid": pid},
detail={
"reason": "process_dead", "pid": pid},
)
finally:
conn.close()
except Exception:
logger.exception("Failed to handle process dead for task %s", task_id_check)
logger.exception(
"Failed to handle process dead for task %s", task_id_check)
# #07.2: Fix-3b 已删除。review 超时/crash 统一由 process_dead + _check_timeouts 处理
@@ -1497,16 +1565,20 @@ Parent Task ID: {parent_task.id}
finally:
conn.close()
except Exception as e:
logger.error("Mail %s: ticker reply check error: %s", original_task_id, e)
logger.error(
"Mail %s: ticker reply check error: %s",
original_task_id,
e)
return True # 保守:查询失败假设有回复
def _check_recent_routing(self, db_path: Path, task_id: str,
action_type: str) -> bool:
action_type: str) -> bool:
"""检查最近 5 分钟内是否已 dispatch 过指定类型的路由(防重复)"""
try:
conn = get_connection(db_path)
try:
# 检查是否有 from_status=review 的 dispatched 记录(防止重复 review dispatch
# 检查是否有 from_status=review 的 dispatched 记录(防止重复 review
# dispatch
if action_type == "review":
row = conn.execute(
"SELECT COUNT(*) as cnt FROM routing_decisions "
@@ -1537,17 +1609,22 @@ Parent Task ID: {parent_task.id}
NON_TERMINAL = {"claimed", "working", "review", "reviewing"}
projects = self.registry.list_projects()
recovery_report = {"projects": {}, "total_recovered": 0, "total_noop": 0}
recovery_report = {
"projects": {},
"total_recovered": 0,
"total_noop": 0}
# 收集所有需要扫描的项目(registry + 虚拟项目)
project_dirs = {}
for project_id, project_info in projects.items():
if project_info.get("status") == "active":
project_dirs[project_id] = self.registry.root / project_id / "blackboard.db"
project_dirs[project_id] = self.registry.root / \
project_id / "blackboard.db"
# 虚拟项目
for virtual_id in ("_general", "_mail"):
virtual_db = Path(self.registry.root) / virtual_id / "blackboard.db"
virtual_db = Path(self.registry.root) / \
virtual_id / "blackboard.db"
if virtual_db.exists() and virtual_id not in project_dirs:
project_dirs[virtual_id] = virtual_db
@@ -1567,25 +1644,28 @@ Parent Task ID: {parent_task.id}
old_pid = self._current_project_id
self._current_project_id = project_id
try:
recovered, noop_count = self._recover_project(db_path, NON_TERMINAL)
recovered, noop_count = self._recover_project(
db_path, NON_TERMINAL)
if recovered:
recovery_report["projects"][project_id] = recovered
recovery_report["total_recovered"] += len(recovered)
recovery_report["total_noop"] += noop_count
except Exception:
logger.exception("Startup recovery failed for project %s", project_id)
logger.exception(
"Startup recovery failed for project %s", project_id)
finally:
self._current_project_id = old_pid
if recovery_report["total_recovered"] > 0:
logger.info("Startup recovery: %d tasks recovered across %d projects",
recovery_report["total_recovered"],
len(recovery_report["projects"]))
recovery_report["total_recovered"],
len(recovery_report["projects"]))
elif recovery_report["total_noop"] > 0:
logger.info("Startup recovery: %d tasks kept as-is (no recovery needed)",
recovery_report["total_noop"])
recovery_report["total_noop"])
else:
logger.info("Startup recovery: no non-terminal tasks found, clean start")
logger.info(
"Startup recovery: no non-terminal tasks found, clean start")
return recovery_report
@@ -1608,10 +1688,13 @@ Parent Task ID: {parent_task.id}
for task in rows:
try:
action = self._determine_recovery_action(conn, task, status, db_path)
action = self._determine_recovery_action(
conn, task, status, db_path)
if action:
self._execute_recovery(conn, task["id"], action, db_path)
recovered.append({"task_id": task["id"], "from": status, "action": action})
self._execute_recovery(
conn, task["id"], action, db_path)
recovered.append(
{"task_id": task["id"], "from": status, "action": action})
else:
# 审计:保持原状的任务也记录事件
noop_count += 1
@@ -1622,14 +1705,15 @@ Parent Task ID: {parent_task.id}
)
conn.commit()
except Exception:
logger.exception("Startup recovery failed for task %s", task["id"])
logger.exception(
"Startup recovery failed for task %s", task["id"])
finally:
conn.close()
return recovered, noop_count
def _determine_recovery_action(self, conn, task, status: str,
db_path: Path) -> Optional[str]:
db_path: Path) -> Optional[str]:
"""根据黑板线索决定恢复动作,返回 None 表示不需要干预"""
task_id = task["id"]
@@ -1700,7 +1784,8 @@ Parent Task ID: {parent_task.id}
# 无审查结论 → 保持 reviewticker 自然会 dispatch reviewer
return None
def _execute_recovery(self, conn, task_id: str, action: str, db_path: Path):
def _execute_recovery(self, conn, task_id: str,
action: str, db_path: Path):
"""执行恢复动作"""
# 获取原始状态(用于审计)
orig_row = conn.execute(
@@ -1712,17 +1797,22 @@ Parent Task ID: {parent_task.id}
self._transition_status(
conn, task_id, "pending",
agent="daemon",
detail={"reason": "startup_recovery", "original_status": orig_status},
detail={
"reason": "startup_recovery",
"original_status": orig_status},
)
# 清空 current_agent(常规推 pending,无特定 agent 接手)
conn.execute("UPDATE tasks SET current_agent=NULL WHERE id=?", (task_id,))
conn.execute(
"UPDATE tasks SET current_agent=NULL WHERE id=?", (task_id,))
conn.commit()
elif action == "push_to_pending_keep_agent":
self._transition_status(
conn, task_id, "pending",
agent="daemon",
detail={"reason": "startup_recovery", "original_status": orig_status},
detail={
"reason": "startup_recovery",
"original_status": orig_status},
)
# 保留 current_agent,让同一 agent 重新接手
conn.commit()
@@ -1731,7 +1821,9 @@ Parent Task ID: {parent_task.id}
self._transition_status(
conn, task_id, "review",
agent="daemon",
detail={"reason": "startup_recovery", "original_status": "working"},
detail={
"reason": "startup_recovery",
"original_status": "working"},
)
conn.commit()
@@ -1739,7 +1831,9 @@ Parent Task ID: {parent_task.id}
self._transition_status(
conn, task_id, "done",
agent="daemon",
detail={"reason": "startup_recovery", "original_status": orig_status},
detail={
"reason": "startup_recovery",
"original_status": orig_status},
)
conn.commit()
@@ -1747,22 +1841,30 @@ Parent Task ID: {parent_task.id}
self._transition_status(
conn, task_id, "failed",
agent="daemon",
detail={"reason": "startup_recovery", "original_status": orig_status},
detail={
"reason": "startup_recovery",
"original_status": orig_status},
)
conn.commit()
# 记录恢复审计事件
conn.execute(
"INSERT INTO events (task_id, agent, event_type, detail) VALUES (?, ?, ?, ?)",
(task_id, "daemon", "startup_recovery", json.dumps({"action": action}))
(task_id, "daemon", "startup_recovery",
json.dumps({"action": action}))
)
conn.commit()
logger.info("Recovery: task %s%s (action=%s)", task_id, action, action)
logger.info(
"Recovery: task %s%s (action=%s)",
task_id,
action,
action)
def _find_pre_reviewing_status(self, conn, task_id: str) -> str:
"""查 events 表找到 reviewing 之前的状态(done 或 failed"""
# _transition_status 写入 event_type=f"task_{new_status}"detail 用 from/to
# _transition_status 写入 event_type=f"task_{new_status}"detail 用
# from/to
rows = conn.execute(
"""SELECT detail FROM events
WHERE task_id=? AND event_type='task_reviewing'
@@ -1773,7 +1875,8 @@ Parent Task ID: {parent_task.id}
for event in rows:
try:
detail = json.loads(event["detail"])
# _transition_status detail 格式: {"from": old_status, "to": new_status, ...}
# _transition_status detail 格式: {"from": old_status, "to":
# new_status, ...}
prev = detail.get("from") or detail.get("old_status")
if prev in ("done", "failed"):
return prev