add: 关羽完成已调研结果和实时风控系统上传

This commit is contained in:
cfdaily
2026-03-21 20:24:43 +08:00
parent e43c7a5214
commit f549b28ac7
7 changed files with 1454 additions and 0 deletions
@@ -0,0 +1,157 @@
#!/usr/bin/env bash
# =============================================================================
# sanguo_vnpy 自动化部署流水线脚本
# 版本: v1.0
# 作者: 姜维(后勤总督)
# =============================================================================
# Abort the pipeline as soon as any command exits non-zero.
set -e

# Opening banner.
cat <<'BANNER'
========================================================================
🚀 sanguo_vnpy 自动化部署流水线
========================================================================
BANNER

# ANSI colour escapes. They are stored as literal backslash sequences and
# only become real escape bytes when a print helper expands them.
NC='\033[0m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
# Logging helpers. Each takes the message as $1 and prints it behind a
# coloured severity tag. Colours come from the BLUE/GREEN/YELLOW/RED/NC
# globals.
#
# Only the colour variables are expanded with %b; the message itself is
# printed with %s so backslashes in paths or command output are shown
# verbatim instead of being interpreted as escapes.

# Informational progress message (stdout).
print_info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}

# Successful completion of a step (stdout).
print_success() {
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}

# Non-fatal problem (stdout).
print_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"
}

# Error message. Goes to stderr so it is not lost when stdout is piped or
# captured.
print_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}" >&2
}
# Print the current local time as YYYYMMDD_HHMMSS on stdout.
get_timestamp() {
  date '+%Y%m%d_%H%M%S'
}
# Configuration.
#
# PROJECT_DIR  project checkout to build. It can be overridden from the
#              environment so the pipeline is not tied to one developer's
#              home directory; the default is the original path.
# DEPLOY_ENV   first positional argument; defaults to "production".
# TIMESTAMP    taken once here and reused when naming the deploy directory.
PROJECT_DIR="${PROJECT_DIR:-/Users/chufeng/.openclaw/agents/main/workspace/projects/sanguo_vnpy}"
DEPLOY_ENV="${1:-production}"
TIMESTAMP=$(get_timestamp)

echo ""
print_info "部署环境: $DEPLOY_ENV"
print_info "时间戳: $TIMESTAMP"
echo ""
# Step 1: build. Enter the project, make sure a virtualenv exists, activate
# it and install the project in editable mode with the alpha and dev extras.
print_info "步骤 1: 代码构建..."
cd "$PROJECT_DIR"

# Create the virtualenv on first run only.
if ! [ -d venv ]; then
  print_warning "虚拟环境不存在,正在创建..."
  python3 -m venv venv
fi

# Activate it in the current shell so later pip/python calls use it.
print_info "激活虚拟环境..."
. venv/bin/activate

# Refresh the packaging tools, then install the project itself.
print_info "升级项目依赖..."
pip install --upgrade pip wheel
pip install -e ".[alpha,dev]"

print_success "代码构建完成"
# Step 2: lint. Ruff is optional: when it is not on PATH the step is skipped
# with a warning. A lint failure aborts the script through `set -e`.
print_info "步骤 2: 代码质量检查..."
if ! command -v ruff > /dev/null 2>&1; then
  print_warning "Ruff 未安装,跳过代码检查"
else
  print_info "运行 Ruff 代码检查..."
  ruff check sanguo/
  print_success "代码检查通过"
fi

# Step 3: tests. Skipped with a warning when either the tests/ directory or
# pytest itself is missing; a failing test run aborts the script.
print_info "步骤 3: 运行测试..."
if [ ! -d tests ]; then
  print_warning "测试目录不存在,跳过测试"
elif ! command -v pytest > /dev/null 2>&1; then
  print_warning "pytest 未安装,跳过测试"
else
  print_info "运行测试..."
  pytest tests/ -v
  print_success "测试通过"
fi
# Step 4: assemble the deployment bundle (source trees, metadata, wheel).
print_info "步骤 4: 构建部署包..."

# Create the staging directory with mktemp so the name cannot be guessed or
# pre-created by another local user; the timestamp stays in the name for
# readability.
DEPLOY_DIR=$(mktemp -d "/tmp/sanguo_vnpy_deploy_${TIMESTAMP}_XXXXXX")

# Copy sources. The source directories are named WITHOUT a trailing slash:
# BSD/macOS cp treats "dir/" as "the contents of dir", which would flatten
# sanguo/ and vnpy/ into the bundle root instead of keeping them as
# sub-directories.
print_info "复制项目文件..."
cp -R sanguo "$DEPLOY_DIR/"

# vnpy/ is optional in the checkout. Skip it explicitly rather than hiding
# every cp error behind "2>/dev/null || true".
if [ -d vnpy ]; then
  cp -R vnpy "$DEPLOY_DIR/"
else
  print_warning "vnpy 目录不存在,跳过复制"
fi

cp pyproject.toml "$DEPLOY_DIR/"
cp README.md "$DEPLOY_DIR/"

# Build the wheel straight into the bundle directory.
print_info "构建 wheel 包..."
pip install build
python -m build --wheel --outdir "$DEPLOY_DIR/"

print_success "部署包构建完成: $DEPLOY_DIR"
# Step 5: deploy to the selected environment. Any value other than
# "production" or "testing" is treated as a local development deploy.
print_info "步骤 5: 部署到目标环境..."
case "$DEPLOY_ENV" in
  production)
    print_info "生产环境部署..."
    # Placeholder: the real production rollout (e.g. upload to Alibaba Cloud
    # OSS, SSH to the server) is not implemented; only a warning is printed.
    print_warning "生产环境部署需要配置阿里云凭证"
    ;;
  testing)
    print_info "测试环境部署..."
    print_success "测试环境部署完成"
    ;;
  *)
    print_info "本地开发环境部署..."
    print_success "本地部署完成"
    ;;
esac
# Step 6: smoke-test the build by importing both top-level packages with the
# interpreter currently on PATH. An import error aborts the script.
print_info "步骤 6: 验证部署..."
print_info "验证模块导入..."
python -c "import sanguo; import vnpy; print('✅ 模块导入成功')"
print_success "部署验证通过"

# Closing summary. The here-document is unquoted so $(date) and the
# variables are expanded.
cat <<SUMMARY

========================================================================
🎉 部署完成!
========================================================================
部署时间: $(date)
部署环境: $DEPLOY_ENV
部署包: $DEPLOY_DIR
========================================================================

SUMMARY
@@ -0,0 +1,219 @@
# =============================================================================
# sanguo_vnpy 阿里云生产环境 Terraform 配置
# 版本: v1.0
# 作者: 姜维(后勤总督)
# =============================================================================
# Require the Alibaba Cloud provider from the aliyun namespace at version
# 1.212.0 or newer.
terraform {
  required_providers {
    alicloud = {
      source  = "aliyun/alicloud"
      version = ">= 1.212.0"
    }
  }
}

# Region and credentials are all supplied through input variables; nothing
# is hard-coded in this block.
provider "alicloud" {
  region     = var.alicloud_region
  access_key = var.alicloud_access_key
  secret_key = var.alicloud_secret_key
}
# =============================================================================
# 变量定义
# =============================================================================
# Provider credentials. Both are marked sensitive and have no default, so
# they must be supplied by the caller.
variable "alicloud_access_key" {
  type        = string
  sensitive   = true
  description = "阿里云 Access Key"
}

variable "alicloud_secret_key" {
  type        = string
  sensitive   = true
  description = "阿里云 Secret Key"
}

# Region for the provider; also used to build the zone id of the vswitch
# and the ECS instance.
variable "alicloud_region" {
  type        = string
  default     = "cn-hangzhou"
  description = "阿里云区域"
}

# Environment label interpolated into resource names.
variable "environment" {
  type        = string
  default     = "production"
  description = "环境类型: production/testing/development"
}

# ECS instance type.
variable "instance_type" {
  type        = string
  default     = "ecs.c6.large"
  description = "ECS 实例规格"
}

# Address range of the VPC.
variable "vpc_cidr" {
  type        = string
  default     = "10.0.0.0/16"
  description = "VPC CIDR 块"
}

# Address range of the vswitch.
variable "vswitch_cidr" {
  type        = string
  default     = "10.0.0.0/24"
  description = "虚拟交换机 CIDR 块"
}
# =============================================================================
# VPC 网络
# =============================================================================
# VPC for the environment; the name carries the environment label.
resource "alicloud_vpc" "sanguo_vpc" {
  cidr_block = var.vpc_cidr
  vpc_name   = "sanguo-vnpy-${var.environment}-vpc"
}

# Single vswitch inside the VPC.
resource "alicloud_vswitch" "sanguo_vswitch" {
  vpc_id       = alicloud_vpc.sanguo_vpc.id
  cidr_block   = var.vswitch_cidr
  vswitch_name = "sanguo-vnpy-${var.environment}-vswitch"

  # NOTE(review): the zone id is built as "<region>-a"; this assumes the
  # chosen region has a zone with that suffix - confirm for each region.
  zone_id = "${var.alicloud_region}-a"
}
# =============================================================================
# 安全组
# =============================================================================
# Security group attached to the VPC; the ingress rules are declared as
# separate alicloud_security_group_rule resources.
resource "alicloud_security_group" "sanguo_sg" {
  vpc_id      = alicloud_vpc.sanguo_vpc.id
  name        = "sanguo-vnpy-${var.environment}-sg"
  description = "sanguo_vnpy ${var.environment} 安全组"
}
# Source range allowed to open SSH sessions. The default keeps the previous
# behaviour (any address), but production deployments should narrow this to
# an office/VPN/bastion range, because the instance accepts password logins.
variable "ssh_allowed_cidr" {
  description = "允许访问 SSH (22 端口) 的来源 CIDR"
  type        = string
  default     = "0.0.0.0/0"
}

# Ingress rule: TCP 22 (SSH) from var.ssh_allowed_cidr.
resource "alicloud_security_group_rule" "allow_ssh" {
  type              = "ingress"
  ip_protocol       = "tcp"
  nic_type          = "intranet"
  policy            = "accept"
  port_range        = "22/22"
  priority          = 1
  security_group_id = alicloud_security_group.sanguo_sg.id
  cidr_ip           = var.ssh_allowed_cidr
}
# Ingress rule: TCP 80 from any address.
resource "alicloud_security_group_rule" "allow_http" {
  security_group_id = alicloud_security_group.sanguo_sg.id
  type              = "ingress"
  policy            = "accept"
  priority          = 2
  nic_type          = "intranet"
  ip_protocol       = "tcp"
  port_range        = "80/80"
  cidr_ip           = "0.0.0.0/0"
}

# Ingress rule: TCP 8080 from any address. 8080 is the port the
# vnpy_web_url output points at.
resource "alicloud_security_group_rule" "allow_vnpy" {
  security_group_id = alicloud_security_group.sanguo_sg.id
  type              = "ingress"
  policy            = "accept"
  priority          = 3
  nic_type          = "intranet"
  ip_protocol       = "tcp"
  port_range        = "8080/8080"
  cidr_ip           = "0.0.0.0/0"
}
# =============================================================================
# ECS 实例
# =============================================================================
# The application server: one pay-as-you-go ECS instance in the project
# vswitch, protected by the project security group.
resource "alicloud_instance" "sanguo_ecs" {
  instance_name        = "sanguo-vnpy-${var.environment}-ecs"
  instance_type        = var.instance_type
  instance_charge_type = "PostPaid"
  image_id             = "ubuntu_22_04_x64_20G_alibase_20240228.vhd"
  password             = var.ecs_password

  # Placement and networking.
  availability_zone = "${var.alicloud_region}-a"
  vswitch_id        = alicloud_vswitch.sanguo_vswitch.id
  security_groups   = [alicloud_security_group.sanguo_sg.id]

  # Outbound internet bandwidth, billed by traffic.
  internet_charge_type       = "PayByTraffic"
  internet_max_bandwidth_out = 100

  # System disk.
  system_disk_category = "cloud_efficiency"
  system_disk_size     = 40
}
# Login password for the ECS instance.
#
# Deliberately has NO default: the previous default committed a working
# root password to version control, and SSH is reachable from the internet.
# Supply the value at apply time, e.g. through the TF_VAR_ecs_password
# environment variable or an untracked *.tfvars file.
variable "ecs_password" {
  description = "ECS 实例密码"
  type        = string
  sensitive   = true
}
# =============================================================================
# OSS 对象存储
# =============================================================================
# Private OSS bucket; the bucket name carries the environment label.
resource "alicloud_oss_bucket" "sanguo_oss" {
  acl    = "private"
  bucket = "sanguo-vnpy-${var.environment}-data"
}
# =============================================================================
# RDS 数据库(可选)
# =============================================================================
# Feature flag for the optional RDS instance; off by default.
variable "enable_rds" {
  description = "是否启用 RDS 数据库"
  type        = bool
  default     = false
}

# Optional MySQL 8.0 RDS instance, created only when var.enable_rds is true.
resource "alicloud_db_instance" "sanguo_rds" {
  count = var.enable_rds ? 1 : 0

  engine           = "MySQL"
  engine_version   = "8.0"
  instance_type    = "rds.mysql.s2.large"
  instance_storage = 20
  vswitch_id       = alicloud_vswitch.sanguo_vswitch.id

  # Whitelist only addresses inside the project VPC. The previous value,
  # "0.0.0.0/0", allowed every address to attempt a database connection.
  security_ips = [var.vpc_cidr]

  # NOTE(review): confirm that "db_instance_name" is an argument of
  # alicloud_db_instance in the pinned provider version; the provider may
  # name this argument "instance_name" instead.
  db_instance_name = "sanguo-vnpy-${var.environment}-rds"
}
# =============================================================================
# 输出
# =============================================================================
# Identifier of the VPC.
output "vpc_id" {
  value       = alicloud_vpc.sanguo_vpc.id
  description = "VPC ID"
}

# Public address of the ECS instance.
output "ecs_public_ip" {
  value       = alicloud_instance.sanguo_ecs.public_ip
  description = "ECS 公网 IP"
}

# Private address of the ECS instance.
output "ecs_private_ip" {
  value       = alicloud_instance.sanguo_ecs.private_ip
  description = "ECS 私网 IP"
}

# Name of the OSS bucket.
output "oss_bucket_name" {
  value       = alicloud_oss_bucket.sanguo_oss.bucket
  description = "OSS 存储桶名称"
}

# Ready-to-paste SSH command targeting the public address as root.
output "ecs_ssh_command" {
  value       = "ssh root@${alicloud_instance.sanguo_ecs.public_ip}"
  description = "SSH 连接命令"
}

# URL on port 8080 of the public address.
output "vnpy_web_url" {
  value       = "http://${alicloud_instance.sanguo_ecs.public_ip}:8080"
  description = "vn.py Web 界面地址"
}
@@ -0,0 +1,335 @@
# sanguo_vnpy 阿里云生产环境应急响应方案
**版本**: v1.0
**作者**: 姜维(后勤总督)
**日期**: 2026-03-21
---
## 🚨 应急响应原则
### 1. 快速响应原则
- **5分钟内**:发现问题并启动响应流程
- **15分钟内**:完成初步诊断和影响评估
- **30分钟内**:确定并执行恢复方案
### 2. 优先级原则
- **P0(严重)**:系统完全不可用,数据丢失风险
- **P1(高)**:核心功能异常,影响主要交易
- **P2(中)**:次要功能异常,不影响核心交易
- **P3(低)**:轻微问题,用户体验影响
### 3. 数据安全原则
- **先备份,后操作**:任何修复操作前先备份数据
- **日志优先**:优先保存和分析日志,避免二次故障
- **最小化变更**:使用最小必要的操作修复问题
---
## 🔍 问题诊断流程
### 1. 监控告警触发
#### 告警来源
1. **系统监控**:Prometheus + Grafana
2. **应用监控**:sanguo_vnpy 内部健康检查
3. **业务监控**:策略执行异常告警
4. **用户反馈**:用户上报的问题
#### 告警级别对应
| 告警类型 | 影响 | 响应级别 |
|---------|------|---------|
| 实例宕机 | 系统不可用 | P0 |
| CPU > 90% 5分钟 | 性能下降 | P1 |
| 内存 > 90% 5分钟 | 可能OOM | P1 |
| 磁盘 > 95% | 数据写入失败 | P0 |
| vn.py 进程消失 | 应用不可用 | P0 |
| 策略执行失败 | 业务影响 | P1 |
### 2. 快速诊断步骤
#### 步骤 1: 检查系统状态
```bash
# 1. 检查服务器是否在线
ping <server-ip>
# 2. SSH 登录(如果可能)
ssh root@<server-ip>
# 3. 检查系统资源
top
htop
df -h
free -m
# 4. 检查网络
ping 8.8.8.8
curl -I https://www.aliyun.com
```
#### 步骤 2: 检查服务状态
```bash
# 1. 检查 vn.py 进程
ps aux | grep -i vnpy
ps aux | grep -i python
# 2. 检查端口监听
netstat -tlnp | grep 8080
ss -tlnp | grep 8080
# 3. 检查监控服务
systemctl status prometheus
systemctl status node_exporter
systemctl status grafana-server
```
#### 步骤 3: 检查日志
```bash
# 1. 系统日志
tail -100 /var/log/syslog
tail -100 /var/log/messages
# 2. 应用日志
tail -100 /path/to/sanguo_vnpy/logs/app.log
tail -100 /path/to/sanguo_vnpy/logs/error.log
# 3. 云服务商日志
# 阿里云控制台查看云服务器监控和事件
```
---
## 🔧 常见问题应急处理
### 场景 1: 实例完全宕机(P0)
#### 现象
- Ping 无响应
- SSH 无法连接
- 监控显示实例离线
#### 应急处理
1. **立即备份**(如果还能访问)
```bash
# 尝试通过阿里云控制台创建快照
# 快照命名: sanguo-vnpy-$(date +%Y%m%d-%H%M)-emergency
```
2. **重启实例**
- 阿里云控制台 → 实例 → 重启
- 等待 2-5 分钟
3. **如果重启失败**
- 使用可用快照回滚
- 或从备份数据重建实例
4. **验证恢复**
```bash
# 检查服务是否恢复
ssh root@<server-ip>
ps aux | grep vnpy
curl http://localhost:8080/health
```
---
### 场景 2: vn.py 进程崩溃(P0)
#### 现象
- 实例在线,但 vn.py 进程消失
- 端口 8080 无响应
- 应用监控告警
#### 应急处理
1. **保存崩溃现场**
```bash
# 保存系统状态
dmesg > /tmp/dmesg-$(date +%Y%m%d-%H%M).log
vmstat 1 5 > /tmp/vmstat-$(date +%Y%m%d-%H%M).log
# 保存应用日志
cp -r /path/to/sanguo_vnpy/logs /tmp/logs-backup-$(date +%Y%m%d-%H%M)
```
2. **检查崩溃原因**
```bash
# 检查系统日志中的 OOM
grep -i "out of memory" /var/log/syslog
grep -i "killed process" /var/log/syslog
# 检查应用错误日志
tail -200 /path/to/sanguo_vnpy/logs/error.log
```
3. **快速重启服务**
```bash
# 进入项目目录
cd /path/to/sanguo_vnpy
# 激活虚拟环境
source venv/bin/activate
# 启动 vn.py
python -m vnpy &
# 或者使用服务管理
systemctl start sanguo-vnpy
```
4. **验证恢复**
```bash
# 检查进程
ps aux | grep vnpy
# 检查端口
curl http://localhost:8080/health
# 检查监控
# 确认 Prometheus 数据恢复
```
---
### 场景 3: 磁盘空间满(P0)
#### 现象
- 磁盘使用率 > 95%
- 数据写入失败
- 应用无法保存数据
#### 应急处理
1. **立即清理临时文件**
```bash
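# 清理系统临时文件(只删除 1 天前的内容:/tmp 中可能有运行中进程的套接字,
# 以及按本方案刚保存的故障现场 dmesg/vmstat/logs-backup,不能整体 rm -rf)
find /tmp -xdev -mindepth 1 -mtime +1 -delete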
# 清理应用缓存
rm -rf /path/to/sanguo_vnpy/cache/*
# 清理旧日志(保留最近7天)
find /path/to/sanguo_vnpy/logs -name "*.log" -mtime +7 -delete
```
2. **检查大文件**
```bash
# 查找大于 100MB 的文件(-xdev 只扫描根文件系统,避免遍历 /proc 等挂载点)
find / -xdev -type f -size +100M -exec ls -lh {} + 2>/dev/null
# 检查数据目录
du -sh /path/to/sanguo_vnpy/data
du -sh /path/to/sanguo_vnpy/results
```
3. **扩容磁盘(如果需要)**
- 阿里云控制台 → 云盘 → 扩容
- 或挂载新的数据盘
4. **验证恢复**
```bash
df -h
# 确认使用率下降到安全范围(< 80%)
```
---
### 场景 4: 数据库连接失败(P1)
#### 现象
- 应用报错无法连接数据库
- 策略无法获取数据
- 回测无法执行
#### 应急处理
1. **检查数据库状态**
```bash
# 如果使用 RDS
# 阿里云控制台检查 RDS 状态
# 如果使用本地 SQLite
ls -lh /path/to/sanguo_vnpy/data/*.db
# 检查文件权限和完整性
```
2. **网络连接测试**
```bash
# 测试数据库连接
telnet <db-host> <db-port>
nc -zv <db-host> <db-port>
# 检查安全组
# 确认应用服务器 IP 在数据库白名单中
```
3. **快速恢复方案**
```bash
# 方案 A: 切换到本地缓存数据
# 修改配置使用 akshare 直接获取
# 方案 B: 从备份恢复数据库
# 恢复最近的数据库备份
# 方案 C: 重启数据库服务
# 如果是自建数据库,重启服务
```
---
## 📋 应急响应检查清单
### 响应前检查
- [ ] 确认告警级别和影响范围
- [ ] 通知相关人员(诸葛亮/主公)
- [ ] 保存当前系统状态和日志
- [ ] 准备回滚方案
### 响应中检查
- [ ] 执行诊断步骤,确定根因
- [ ] 执行最小必要的修复操作
- [ ] 持续监控系统状态
- [ ] 记录所有操作和时间点
### 响应后检查
- [ ] 验证核心功能恢复正常
- [ ] 验证监控数据正常
- [ ] 验证业务数据完整性
- [ ] 总结故障原因和改进措施
---
## 📞 联络清单
### 紧急联络
- **总军师(诸葛亮)**:负责决策和协调
- **主公**:最终决策和资源协调
- **赵云**:数据库和数据相关问题
- **关羽**:回测引擎和风控问题
- **张飞**:API 兼容层问题
- **司马懿**:安全和合规问题
### 阿里云支持
- **阿里云控制台**:https://home.console.aliyun.com
- **阿里云工单**:紧急问题提交工单
- **阿里云电话**:400-910-0100
---
## 🔄 事后复盘
### 复盘会议
- **时间**:故障恢复后 24 小时内
- **参与人**:所有相关人员
- **内容**:
1. 故障回顾和时间线
2. 根因分析
3. 响应流程评估
4. 改进措施讨论
### 改进措施
- 技术改进:防止同类故障再次发生
- 流程改进:优化响应流程
- 监控改进:完善告警和监控
- 文档改进:更新应急方案
---
**本应急方案会持续更新,确保生产环境安全稳定运行!** 🚛
@@ -0,0 +1,211 @@
#!/usr/bin/env bash
# =============================================================================
# sanguo_vnpy 阿里云生产环境监控系统部署脚本
# 版本: v1.0
# 作者: 姜维(后勤总督)
# =============================================================================
# Abort the installer as soon as any command exits non-zero.
set -e

# Opening banner.
cat <<'BANNER'
========================================================================
🚀 sanguo_vnpy 监控系统部署
========================================================================
BANNER

# ANSI colour escapes. They are stored as literal backslash sequences and
# only become real escape bytes when a print helper expands them.
NC='\033[0m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
# Logging helpers. Each takes the message as $1 and prints it behind a
# coloured severity tag. Colours come from the BLUE/GREEN/YELLOW/RED/NC
# globals.
#
# Only the colour variables are expanded with %b; the message itself is
# printed with %s so backslashes in paths or command output are shown
# verbatim instead of being interpreted as escapes.

# Informational progress message (stdout).
print_info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}

# Successful completion of a step (stdout).
print_success() {
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}

# Non-fatal problem (stdout).
print_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"
}

# Error message. Goes to stderr so it is not lost when stdout is piped or
# captured.
print_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}" >&2
}
# Step 1: install the monitoring stack with the system package manager.
# apt-get is preferred when both managers are present; a host with neither
# is rejected.
#
# NOTE(review): the package names are passed as-is. Confirm that
# "node_exporter" and "grafana" are installable under these names on the
# target distribution and its configured repositories.
print_info "步骤 1: 安装系统依赖..."

if ! command -v apt-get > /dev/null 2>&1 && ! command -v yum > /dev/null 2>&1; then
  print_error "不支持的操作系统"
  exit 1
fi

if command -v apt-get > /dev/null 2>&1; then
  sudo apt-get update
  sudo apt-get install -y prometheus node_exporter grafana nginx
else
  sudo yum install -y epel-release
  sudo yum install -y prometheus node_exporter grafana nginx
fi

print_success "系统依赖安装完成"
# Step 2: write the Prometheus server configuration.
#
# The file is written through `sudo tee`. With `sudo cat > file` the
# redirection is opened by the calling (unprivileged) shell before sudo
# runs, so the write fails unless the whole script already runs as root.
#
# The YAML is written with explicit nesting: each scrape job holds its own
# static_configs, and metrics_path belongs to the sanguo_vnpy job.
#
# NOTE(review): an Alertmanager target on localhost:9093 is configured
# here, but this script does not install or start Alertmanager.
print_info "步骤 2: 配置 Prometheus..."
sudo mkdir -p /etc/prometheus
sudo tee /etc/prometheus/prometheus.yml > /dev/null << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node_exporter'
    static_configs:
      - targets: ['localhost:9100']

  - job_name: 'sanguo_vnpy'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/metrics'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

rule_files:
  - "/etc/prometheus/alerts.yml"
EOF
# Step 3: write the alerting rules referenced by prometheus.yml.
#
# - Written through `sudo tee`, because the redirection in `sudo cat > file`
#   is performed by the unprivileged shell.
# - The rule group is written with explicit YAML nesting
#   (groups -> rules -> alert / labels / annotations).
# - HighCPUUsage uses rate() instead of irate(): irate() only looks at the
#   last two samples in the window, so a single spike or dip can reset the
#   5-minute "for" clause; rate() averages over the whole window.
# - The here-document delimiter is quoted, so {{ $labels.* }} is written
#   literally for Prometheus to template.
print_info "步骤 3: 配置告警规则..."
sudo tee /etc/prometheus/alerts.yml > /dev/null << 'EOF'
groups:
  - name: sanguo_vnpy_alerts
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 已宕机"
          description: "{{ $labels.job }} 实例 {{ $labels.instance }} 已宕机超过 5 分钟"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "实例 {{ $labels.instance }} CPU 使用率过高"
          description: "{{ $labels.instance }} CPU 使用率超过 80%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "实例 {{ $labels.instance }} 内存使用率过高"
          description: "{{ $labels.instance }} 内存使用率超过 80%"

      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘空间不足"
          description: "{{ $labels.instance }} 磁盘使用率超过 90%"

      - alert: VnpyDown
        expr: up{job="sanguo_vnpy"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "sanguo_vnpy 服务已宕机"
          description: "sanguo_vnpy 服务已宕机超过 2 分钟,请立即检查!"
EOF
# Step 4: provision Prometheus as Grafana's default data source.
#
# Written through `sudo tee` (the redirection in `sudo cat > file` runs
# unprivileged) and with explicit YAML nesting so the data source
# attributes sit under the list entry.
print_info "步骤 4: 配置 Grafana..."
sudo mkdir -p /etc/grafana/provisioning/datasources
sudo tee /etc/grafana/provisioning/datasources/prometheus.yml > /dev/null << 'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://localhost:9090
    isDefault: true
    editable: true
EOF
# Step 5: register the three monitoring units for start at boot, then start
# them. All units are enabled before any is started, in the same order.
print_info "步骤 5: 启动监控服务..."

for unit in prometheus node_exporter grafana-server; do
  sudo systemctl enable "$unit"
done

for unit in prometheus node_exporter grafana-server; do
  sudo systemctl start "$unit"
done
# Step 6: publish Grafana, Prometheus and node_exporter behind Nginx on
# port 80.
#
# - Debian-style installs use sites-available/ plus a sites-enabled/ symlink.
#   Where that layout is absent (the yum branch of step 1), the file is
#   dropped into conf.d/ instead of failing on a missing directory.
# - Written through `sudo tee`, because the redirection in `sudo cat > file`
#   is performed by the unprivileged shell.
# - reload-or-restart also starts Nginx when it is not running yet; a plain
#   reload fails in that case.
#
# NOTE(review): these locations expose Prometheus and node_exporter without
# authentication - confirm that is intended before opening port 80 publicly.
# NOTE(review): a distribution default site that also declares
# "server_name _" on port 80 may take precedence over this server block -
# confirm on the target host.
# NOTE(review): serving Grafana/Prometheus under a sub-path usually also
# needs matching root/external URL settings in those services - confirm.
print_info "步骤 6: 配置 Nginx 反向代理..."

if [ -d /etc/nginx/sites-available ] && [ -d /etc/nginx/sites-enabled ]; then
  nginx_site_conf=/etc/nginx/sites-available/sanguo-monitoring
  nginx_site_link=yes
else
  sudo mkdir -p /etc/nginx/conf.d
  nginx_site_conf=/etc/nginx/conf.d/sanguo-monitoring.conf
  nginx_site_link=no
fi

sudo tee "$nginx_site_conf" > /dev/null << 'EOF'
server {
    listen 80;
    server_name _;

    location /grafana/ {
        proxy_pass http://localhost:3000/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    location /prometheus/ {
        proxy_pass http://localhost:9090/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
    }

    location /node-exporter/ {
        proxy_pass http://localhost:9100/;
        proxy_set_header Host $host;
    }
}
EOF

if [ "$nginx_site_link" = yes ]; then
  sudo ln -sf /etc/nginx/sites-available/sanguo-monitoring /etc/nginx/sites-enabled/
fi

# Validate the configuration before applying it.
sudo nginx -t
sudo systemctl reload-or-restart nginx
# Step 7: print where each service can be reached, using the first address
# reported by `hostname -I`.
print_success "监控系统部署完成!"

host_ip=$(hostname -I | awk '{print $1}')

cat <<INFO

========================================================================
📊 监控系统访问信息
========================================================================
Grafana: http://${host_ip}:3000
Prometheus: http://${host_ip}:9090
Node Exporter: http://${host_ip}:9100

默认 Grafana 账号: admin / admin
========================================================================

INFO