From f4dd9ff78d62ed92e75ac4b8f93298b6bf40eb17 Mon Sep 17 00:00:00 2001 From: cfdaily Date: Sat, 13 Jun 2026 09:27:17 +0800 Subject: [PATCH 1/3] =?UTF-8?q?feat(daemon):=20Mail=20=E5=A4=B1=E8=B4=A5?= =?UTF-8?q?=E9=80=9A=E7=9F=A5=20v2.0=20=E2=80=94=20api=5Ferror=20retry=20+?= =?UTF-8?q?=20=E9=80=9A=E7=9F=A5=E5=A2=9E=E5=BC=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1: api_error rate_limit/500/503 改为可恢复 retry(should_retry=True,60s cooldown) P2: 通知模板动态化(reason 人话翻译 + detail 信息 + 重试情况 + AI Native 知识库) 设计文档:§20.7 (20-task-type-architecture.md) --- src/daemon/mail_notify.py | 125 +++++++++++++++++++++++++++++++------- src/daemon/spawner.py | 5 +- 2 files changed, 108 insertions(+), 22 deletions(-) diff --git a/src/daemon/mail_notify.py b/src/daemon/mail_notify.py index 77cc8a2..092fbff 100644 --- a/src/daemon/mail_notify.py +++ b/src/daemon/mail_notify.py @@ -1,4 +1,4 @@ -"""Mail 失败通知 — 以 system 身份通知发件人""" +"""Mail 失败通知 v2.0 — 以 system 身份通知发件人(AI Native)""" from __future__ import annotations @@ -6,7 +6,7 @@ import json import logging from datetime import datetime from pathlib import Path -from typing import Optional +from typing import Callable, Dict, Optional from src.blackboard.models import Task from src.blackboard.operations import Blackboard @@ -15,21 +15,108 @@ from src.config.agents import AGENT_IDS logger = logging.getLogger(__name__) -# 邮件通知正文模板(统一模板,包含所有可能的失败原因和建议) -_NOTIFY_TEMPLATE = """你的邮件投递失败了。 +# ── Reason 人话翻译 + detail 提取 ────────────────────────────── -📧 原始邮件:「{title}」 -👤 收件人:{to_agent} -❌ 失败原因:{reason} +def _extract_stderr(detail: dict, max_len: int = 200) -> str: + """从 detail 中提取 stderr_preview""" + preview = (detail or {}).get("stderr_preview", "") + if preview and len(preview) > max_len: + preview = preview[:max_len] + "..." + return preview -常见失败原因及处理建议: -• no_reply_found:收件人未回复。建议重发邮件,或通过黑板任务方式联系 -• auth_failed:收件人认证失败。需检查 Agent 配置,联系姜维(jiangwei-infra)排查 -• crash_limit:收件人处理时多次崩溃。系统异常,建议稍后重试 -• task_timeout:处理超时。建议重发或通过其他方式联系 -• 其他原因:建议联系副军师(pangtong-fujunshi)排查 -——系统自动通知""" +def _fmt_retry_info(reason: str, detail: dict) -> str: + """格式化重试情况描述""" + _NO_RETRY_REASONS = { + "no_reply_found", "auth_failed", "agent_error", + "agent_failed", "compact_failed", + } + if reason in _NO_RETRY_REASONS: + return f"无法重试({_REASON_MAP.get(reason, _REASON_MAP[\"_default\"])[0]})" + + count = (detail or {}).get("count", 0) + fallback_count = (detail or {}).get("fallback_count", 0) + + if count > 0: + return f"已自动重试 {count} 次" + if fallback_count > 0: + return f"已自动重试 {fallback_count} 次(fallback)" + return "系统已尝试恢复,但仍失败" + + +# reason_raw → (reason_human_readable, detail_format_fn) +_REASON_MAP: Dict[str, tuple] = { + "no_reply_found": ("收件人未回复(Agent 未能识别或处理此邮件)", lambda d: ""), + "crashed": ("处理时进程崩溃", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else "无 stderr 输出"), + "max_crash_count": ("连续崩溃达上限", lambda d: f"崩溃 {d.get('count', '?')} 次"), + "max_retries": ("续杯耗尽(已自动重试)", lambda d: f"重试 {d.get('count', '?')} 次"), + "max_api_retry_count": ("API 连续失败达上限", lambda d: f"API 重试 {d.get('count', '?')} 次"), + "max_monitor_timeouts": ("处理超时达上限", lambda d: f"超时 {d.get('count', '?')} 次,共约 {d.get('elapsed_seconds', 0) // 60} 分钟"), + "gateway_timeout": ("Agent 执行超时(已续杯重试)", lambda d: ""), + "session_stuck": ("会话假死(lock PID 死亡)", lambda d: f"假死 {d.get('stuck_count', '?')} 次"), + "revive_failed": ("会话恢复失败", lambda d: f"假死 {d.get('stuck_count', '?')} 次"), + "auth_failed": ("Agent 认证失败(配置问题)", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else ""), + "fallback_exhausted": ("主模型和备用模型均失败", lambda d: f"fallback {d.get('fallback_count', '?')} 次,原因: {d.get('fallback_reason', '未知')}"), + "agent_failed": ("收件人主动标记失败", lambda d: ""), + "compact_failed": ("上下文压缩失败", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else ""), + "compact_hanging": ("上下文压缩长时间未完成", lambda d: ""), + "compact_interrupted": ("上下文压缩被中断(已自动重试)", lambda d: ""), + "gateway_unreachable": ("Gateway 不可达(已自动重试)", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else ""), + "lock_conflict": ("会话锁冲突(已自动重试)", lambda d: ""), + "max_retry_count": ("重试耗尽", lambda d: f"重试 {d.get('count', '?')} 次"), + "max_lock_retry_count": ("锁冲突重试耗尽", lambda d: f"重试 {d.get('count', '?')} 次"), + "max_connect_retry_count": ("连接重试耗尽", lambda d: f"重试 {d.get('count', '?')} 次"), + "_default": ("未知原因", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else ""), +} + +# 常见失败原因参考(AI Native:提供知识库让收件 AI 自行判断) +_REASON_REFERENCE = """常见失败原因参考: +• no_reply_found:收件人未回复(Agent 未能识别或处理此邮件) +• crashed / max_crash_count:收件人处理时进程崩溃(已自动重试 3 次) +• max_retries:续杯耗尽(已自动重试 3 次,共约 34 分钟) +• max_api_retry_count:API 连续失败达上限(rate_limit/500/503) +• max_monitor_timeouts:处理超时达上限(共约 31.5 分钟) +• gateway_timeout:Agent 执行超时(已续杯重试) +• session_stuck:Agent 会话假死(lock PID 死亡,revive 失败) +• revive_failed:会话假死后恢复失败 +• auth_failed:Agent 认证失败(配置问题) +• fallback_exhausted:主模型和备用模型均失败 +• agent_failed:收件人主动标记失败 +• compact_failed:上下文压缩失败 +• compact_hanging:上下文压缩长时间未完成(等待超 31.5 分钟) +• compact_interrupted:上下文压缩被中断(已自动重试 3 次) +• gateway_unreachable:Gateway 不可达(已自动重试 3 次) +• lock_conflict:会话锁冲突(已自动重试 3 次) +• 其他:建议排查系统日志""" + + +def _build_notify_text(title: str, to_agent: str, reason: str, + detail: Optional[dict] = None) -> str: + """构建通知正文(v2.0 AI Native)""" + reason_human, detail_fn = _REASON_MAP.get(reason, _REASON_MAP["_default"]) + detail_info = detail_fn(detail or {}) + + retry_info = _fmt_retry_info(reason, detail or {}) + + lines = [ + "邮件投递失败通知", + "", + f"📧 原始邮件:「{title}」", + f"👤 收件人:{to_agent}", + f"❌ 失败原因:{reason_human}({reason})", + f"📊 重试情况:{retry_info}", + ] + + if detail_info: + lines.append("📋 上下文信息:") + lines.append(f" {detail_info}") + + lines.append("") + lines.append(_REASON_REFERENCE) + lines.append("") + lines.append("——系统自动通知") + + return "\n".join(lines) def _is_mail_project(db_path: Path) -> bool: @@ -65,7 +152,7 @@ def notify_mail_failed(db_path: Path, original_mail_id: str, original_mail_id) return - # 获取发件人(优先 assigned_by,fallback must_haves.from) + # 获取发件人(优先 assigned_by,fallback must_hives.from) from_agent = original.assigned_by or meta.get("from", "") to_agent = original.assignee or "" title = original.title or "" @@ -83,12 +170,8 @@ def notify_mail_failed(db_path: Path, original_mail_id: str, original_mail_id, from_agent) target_agent = "pangtong-fujunshi" - # 构造通知正文 - text = _NOTIFY_TEMPLATE.format( - title=title, - to_agent=to_agent, - reason=reason, - ) + # 构造通知正文(v2.0 AI Native) + text = _build_notify_text(title, to_agent, reason, detail) # 创建通知邮件 Task notify_id = f"mail-{int(datetime.now().timestamp() * 1000)}" diff --git a/src/daemon/spawner.py b/src/daemon/spawner.py index b30d43e..91d2770 100644 --- a/src/daemon/spawner.py +++ b/src/daemon/spawner.py @@ -845,6 +845,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ cls.get("retry_field", "retry_count") ) elif outcome == "api_error": + # A9: [DEPRECATED] api_error 已改为 should_retry=True 走续杯路径。 + # 此分支理论上不再命中,保留作为安全兜底。 # A9: 429/API 错误 → release counter(on_complete)+ 推回 pending + 冷却 # 有上限:api_retry_count 累计达 max_retries 则标 failed await self._do_on_complete_async(on_complete, agent_id, outcome) @@ -1842,7 +1844,8 @@ curl -X POST http://{api_host}:{api_port}/api/projects/{project_id}/tasks/{task_ "retry_field": "retry_count", "cooldown_seconds": 60} if any(kw in stderr_lower for kw in [ "rate_limit", "500", "503", "api error"]): - return {"outcome": "api_error", "should_retry": False} + return {"outcome": "api_error", "should_retry": True, + "retry_field": "retry_count", "cooldown_seconds": 60} if any(kw in stderr_lower for kw in [ "compaction-diag", "context-overflow"]): return {"outcome": "compact_failed", "should_retry": False} -- 2.45.4 From 7fb4d988ecdd2ddcd34c743e132727c90b6cb308 Mon Sep 17 00:00:00 2001 From: cfdaily Date: Sat, 13 Jun 2026 09:29:52 +0800 Subject: [PATCH 2/3] =?UTF-8?q?fix:=20lint=20=E4=BF=AE=E5=A4=8D=20+=20api?= =?UTF-8?q?=5Ferror=20=E6=B5=8B=E8=AF=95=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mail_notify: f-string 反斜杠修复、行过长修复、unused import - test_classify_outcome: api_error should_retry 改 True --- src/daemon/mail_notify.py | 20 +++++++++++++++----- tests/unit/test_classify_outcome.py | 6 ++++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/daemon/mail_notify.py b/src/daemon/mail_notify.py index 092fbff..683a0fc 100644 --- a/src/daemon/mail_notify.py +++ b/src/daemon/mail_notify.py @@ -6,7 +6,7 @@ import json import logging from datetime import datetime from pathlib import Path -from typing import Callable, Dict, Optional +from typing import Dict, Optional from src.blackboard.models import Task from src.blackboard.operations import Blackboard @@ -32,7 +32,8 @@ def _fmt_retry_info(reason: str, detail: dict) -> str: "agent_failed", "compact_failed", } if reason in _NO_RETRY_REASONS: - return f"无法重试({_REASON_MAP.get(reason, _REASON_MAP[\"_default\"])[0]})" + reason_human = _REASON_MAP.get(reason, _REASON_MAP.get("_default", ("未知原因", lambda d: "")))[0] + return f"无法重试({reason_human})" count = (detail or {}).get("count", 0) fallback_count = (detail or {}).get("fallback_count", 0) @@ -51,17 +52,26 @@ _REASON_MAP: Dict[str, tuple] = { "max_crash_count": ("连续崩溃达上限", lambda d: f"崩溃 {d.get('count', '?')} 次"), "max_retries": ("续杯耗尽(已自动重试)", lambda d: f"重试 {d.get('count', '?')} 次"), "max_api_retry_count": ("API 连续失败达上限", lambda d: f"API 重试 {d.get('count', '?')} 次"), - "max_monitor_timeouts": ("处理超时达上限", lambda d: f"超时 {d.get('count', '?')} 次,共约 {d.get('elapsed_seconds', 0) // 60} 分钟"), + "max_monitor_timeouts": ( + "处理超时达上限", + lambda d: f"超时 {d.get('count', '?')} 次," + f"共约 {d.get('elapsed_seconds', 0) // 60} 分钟"), "gateway_timeout": ("Agent 执行超时(已续杯重试)", lambda d: ""), "session_stuck": ("会话假死(lock PID 死亡)", lambda d: f"假死 {d.get('stuck_count', '?')} 次"), "revive_failed": ("会话恢复失败", lambda d: f"假死 {d.get('stuck_count', '?')} 次"), "auth_failed": ("Agent 认证失败(配置问题)", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else ""), - "fallback_exhausted": ("主模型和备用模型均失败", lambda d: f"fallback {d.get('fallback_count', '?')} 次,原因: {d.get('fallback_reason', '未知')}"), + "fallback_exhausted": ( + "主模型和备用模型均失败", + lambda d: f"fallback {d.get('fallback_count', '?')} 次," + f"原因: {d.get('fallback_reason', '未知')}"), "agent_failed": ("收件人主动标记失败", lambda d: ""), "compact_failed": ("上下文压缩失败", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else ""), "compact_hanging": ("上下文压缩长时间未完成", lambda d: ""), "compact_interrupted": ("上下文压缩被中断(已自动重试)", lambda d: ""), - "gateway_unreachable": ("Gateway 不可达(已自动重试)", lambda d: f"stderr: {_extract_stderr(d)}" if _extract_stderr(d) else ""), + "gateway_unreachable": ( + "Gateway 不可达(已自动重试)", + lambda d: f"stderr: {_extract_stderr(d)}" + if _extract_stderr(d) else ""), "lock_conflict": ("会话锁冲突(已自动重试)", lambda d: ""), "max_retry_count": ("重试耗尽", lambda d: f"重试 {d.get('count', '?')} 次"), "max_lock_retry_count": ("锁冲突重试耗尽", lambda d: f"重试 {d.get('count', '?')} 次"), diff --git a/tests/unit/test_classify_outcome.py b/tests/unit/test_classify_outcome.py index 07f21d8..62a2061 100644 --- a/tests/unit/test_classify_outcome.py +++ b/tests/unit/test_classify_outcome.py @@ -165,14 +165,16 @@ class TestClassifyErrorApi: 1, {"status": "error"}, "rate_limit exceeded", None ) assert result["outcome"] == "api_error" - assert result["should_retry"] is False + assert result["should_retry"] is True + assert result["cooldown_seconds"] == 60 def test_stderr_500(self): result = Spawner._classify_outcome( 1, {"status": "error"}, "HTTP 500 Internal Server Error", None ) assert result["outcome"] == "api_error" - assert result["should_retry"] is False + assert result["should_retry"] is True + assert result["cooldown_seconds"] == 60 class TestClassifyErrorCompact: -- 2.45.4 From a116f7e6c0de0fbd69e0f7a5fda6c9d1b44768d7 Mon Sep 17 00:00:00 2001 From: cfdaily Date: Sat, 13 Jun 2026 09:33:59 +0800 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20=E6=B3=A8=E9=87=8A=E6=8B=BC=E5=86=99?= =?UTF-8?q?=20must=5Fhives=20=E2=86=92=20must=5Fhaves?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/daemon/mail_notify.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/daemon/mail_notify.py b/src/daemon/mail_notify.py index 683a0fc..b6d86c2 100644 --- a/src/daemon/mail_notify.py +++ b/src/daemon/mail_notify.py @@ -140,7 +140,7 @@ def notify_mail_failed(db_path: Path, original_mail_id: str, """Mail 失败后以 system 身份给发件人发通知邮件 直接通过 Blackboard 创建 Task,不走 HTTP API。 - 防递归:检查原邮件 must_hives.system_notify,为 true 则跳过。 + 防递归:检查原邮件 must_haves.system_notify,为 true 则跳过。 发件人不是有效 Agent(如 system)→ 通知庞统代处理,避免广播风暴。 """ try: @@ -162,7 +162,7 @@ def notify_mail_failed(db_path: Path, original_mail_id: str, original_mail_id) return - # 获取发件人(优先 assigned_by,fallback must_hives.from) + # 获取发件人(优先 assigned_by,fallback must_haves.from) from_agent = original.assigned_by or meta.get("from", "") to_agent = original.assignee or "" title = original.title or "" -- 2.45.4