add: 关羽完成已调研结果和实时风控系统上传
This commit is contained in:
+157
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env bash
# =============================================================================
# sanguo_vnpy automated deployment pipeline
# Version: v1.0
# Author:  Jiang Wei (logistics chief)
# =============================================================================

# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

echo "========================================================================"
echo "🚀 sanguo_vnpy 自动化部署流水线"
echo "========================================================================"

# ANSI colour escape sequences used by the print_* helpers.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # reset ("no colour")
|
||||
|
||||
# Print an informational message to stdout, prefixed with a coloured [INFO] tag.
# Globals:   BLUE, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_info() {
  printf '%b[INFO]%b %s\n' "${BLUE:-}" "${NC:-}" "${1:-}"
}
|
||||
|
||||
# Print a success message to stdout, prefixed with a coloured [SUCCESS] tag.
# Globals:   GREEN, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN:-}" "${NC:-}" "${1:-}"
}
|
||||
|
||||
# Print a warning message to stdout, prefixed with a coloured [WARNING] tag.
# Globals:   YELLOW, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW:-}" "${NC:-}" "${1:-}"
}
|
||||
|
||||
# Print an error message, prefixed with a coloured [ERROR] tag.
# Diagnostics are written to stderr so they are not mixed into captured stdout.
# Globals:   RED, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_error() {
  printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "${1:-}" >&2
}
|
||||
|
||||
# Print the current local time as YYYYMMDD_HHMMSS (e.g. 20260321_142530).
# Outputs: the timestamp on stdout
get_timestamp() {
  date '+%Y%m%d_%H%M%S'
}
|
||||
|
||||
# Configuration.
# PROJECT_DIR can be overridden from the environment; the fallback is the
# workspace path that was previously hard-coded.
PROJECT_DIR="${PROJECT_DIR:-/Users/chufeng/.openclaw/agents/main/workspace/projects/sanguo_vnpy}"
# Target environment: first command-line argument, "production" when omitted.
DEPLOY_ENV="${1:-production}"
TIMESTAMP=$(get_timestamp)

echo ""
print_info "部署环境: $DEPLOY_ENV"
print_info "时间戳: $TIMESTAMP"
echo ""
|
||||
|
||||
# Step 1: build - enter the project, make sure a virtualenv exists, install
# the project and its dependencies into it.
print_info "步骤 1: 代码构建..."

# Report a readable error instead of relying on `set -e` alone.
if ! cd "$PROJECT_DIR"; then
  print_error "无法进入项目目录: $PROJECT_DIR"
  exit 1
fi

# Create the virtualenv on first run.
if [[ ! -d venv ]]; then
  print_warning "虚拟环境不存在,正在创建..."
  python3 -m venv venv
fi

# Activate the virtualenv for the rest of the script.
print_info "激活虚拟环境..."
# shellcheck disable=SC1091 # the activate script only exists at runtime
source venv/bin/activate

# Upgrade tooling, then install the project in editable mode with its
# "alpha" and "dev" extras. `python -m pip` runs pip under the interpreter
# that was just activated.
print_info "升级项目依赖..."
python -m pip install --upgrade pip wheel
python -m pip install -e ".[alpha,dev]"

print_success "代码构建完成"
|
||||
|
||||
# Step 2: static analysis. Skipped with a warning when ruff is not installed.
print_info "步骤 2: 代码质量检查..."

if command -v ruff > /dev/null 2>&1; then
  print_info "运行 Ruff 代码检查..."
  # Fail the pipeline with an explicit message when lint errors are found.
  if ! ruff check sanguo/; then
    print_error "代码检查未通过"
    exit 1
  fi
  print_success "代码检查通过"
else
  print_warning "Ruff 未安装,跳过代码检查"
fi
|
||||
|
||||
# Step 3: run the test suite. Skipped with a warning when there is no tests/
# directory or pytest is not installed.
print_info "步骤 3: 运行测试..."

if [[ ! -d tests ]]; then
  print_warning "测试目录不存在,跳过测试"
elif ! command -v pytest > /dev/null 2>&1; then
  print_warning "pytest 未安装,跳过测试"
else
  print_info "运行测试..."
  # Any non-zero pytest exit status stops the deployment.
  if ! pytest tests/ -v; then
    print_error "测试未通过"
    exit 1
  fi
  print_success "测试通过"
fi
|
||||
|
||||
# Step 4: assemble the deployment bundle (sources + wheel) in a staging dir.
print_info "步骤 4: 构建部署包..."

# mktemp -d creates a fresh directory with an unpredictable suffix, so an
# existing directory of the same name is never silently reused.
DEPLOY_DIR=$(mktemp -d "/tmp/sanguo_vnpy_deploy_${TIMESTAMP}.XXXXXX")

print_info "复制项目文件..."
# No trailing slash on the source: BSD/macOS cp copies the *contents* of
# "dir/" instead of the directory itself, which would flatten the package
# into the staging directory.
cp -R sanguo "$DEPLOY_DIR/"
# vnpy/ is optional; skip it explicitly rather than hiding every cp error.
if [[ -d vnpy ]]; then
  cp -R vnpy "$DEPLOY_DIR/"
else
  print_warning "未找到 vnpy/ 目录,跳过复制"
fi
cp pyproject.toml README.md "$DEPLOY_DIR/"

# Build the wheel straight into the staging directory.
print_info "构建 wheel 包..."
python -m pip install build
python -m build --wheel --outdir "$DEPLOY_DIR/"

print_success "部署包构建完成: $DEPLOY_DIR"
|
||||
|
||||
# Step 5: deploy to the target environment selected by DEPLOY_ENV.
# Any value other than "production" or "testing" is treated as local dev.
print_info "步骤 5: 部署到目标环境..."

case "$DEPLOY_ENV" in
  production)
    print_info "生产环境部署..."
    # TODO: production rollout is not implemented; real logic (e.g. upload to
    # Alibaba Cloud OSS, deploy over SSH) belongs here. Only a reminder is
    # printed and the script carries on.
    print_warning "生产环境部署需要配置阿里云凭证"
    ;;
  testing)
    print_info "测试环境部署..."
    print_success "测试环境部署完成"
    ;;
  *)
    print_info "本地开发环境部署..."
    print_success "本地部署完成"
    ;;
esac
|
||||
|
||||
# Step 6: smoke-test that both packages import in the active interpreter.
# NOTE(review): this exercises the current environment, not the bundle in
# DEPLOY_DIR - confirm that is the intended check.
print_info "步骤 6: 验证部署..."

print_info "验证模块导入..."
if ! python -c "import sanguo; import vnpy; print('✅ 模块导入成功')"; then
  print_error "模块导入失败"
  exit 1
fi

print_success "部署验证通过"

# Final summary.
echo ""
echo "========================================================================"
echo "🎉 部署完成!"
echo "========================================================================"
echo "部署时间: $(date)"
echo "部署环境: $DEPLOY_ENV"
echo "部署包: $DEPLOY_DIR"
echo "========================================================================"
echo ""
|
||||
@@ -0,0 +1,219 @@
|
||||
# =============================================================================
# sanguo_vnpy Alibaba Cloud production environment - Terraform configuration
# Version: v1.0
# Author:  Jiang Wei (logistics chief)
# =============================================================================

terraform {
  required_providers {
    alicloud = {
      source  = "aliyun/alicloud"
      version = ">= 1.212.0"
    }
  }
}

# Credentials and region come from input variables (both keys are declared
# sensitive); supply them via TF_VAR_* or a tfvars file kept out of VCS.
provider "alicloud" {
  access_key = var.alicloud_access_key
  secret_key = var.alicloud_secret_key
  region     = var.alicloud_region
}
|
||||
|
||||
# =============================================================================
# Input variables
# =============================================================================

variable "alicloud_access_key" {
  description = "阿里云 Access Key"
  type        = string
  sensitive   = true
}

variable "alicloud_secret_key" {
  description = "阿里云 Secret Key"
  type        = string
  sensitive   = true
}

variable "alicloud_region" {
  description = "阿里云区域"
  type        = string
  default     = "cn-hangzhou"
}

# Interpolated into every resource name (VPC, vswitch, security group, ECS,
# OSS bucket, RDS).
variable "environment" {
  description = "环境类型: production/testing/development"
  type        = string
  default     = "production"
}

variable "instance_type" {
  description = "ECS 实例规格"
  type        = string
  default     = "ecs.c6.large"
}

variable "vpc_cidr" {
  description = "VPC CIDR 块"
  type        = string
  default     = "10.0.0.0/16"
}

# The default is a /24 inside the default vpc_cidr.
variable "vswitch_cidr" {
  description = "虚拟交换机 CIDR 块"
  type        = string
  default     = "10.0.0.0/24"
}
|
||||
|
||||
# =============================================================================
# VPC network
# =============================================================================

resource "alicloud_vpc" "sanguo_vpc" {
  vpc_name   = "sanguo-vnpy-${var.environment}-vpc"
  cidr_block = var.vpc_cidr
}

resource "alicloud_vswitch" "sanguo_vswitch" {
  vswitch_name = "sanguo-vnpy-${var.environment}-vswitch"
  vpc_id       = alicloud_vpc.sanguo_vpc.id
  cidr_block   = var.vswitch_cidr
  # NOTE(review): the zone is built by appending "-a" to the region; confirm
  # that zone exists (and has stock) for every region this is applied to.
  zone_id = "${var.alicloud_region}-a"
}
|
||||
|
||||
# =============================================================================
# Security group
# =============================================================================

# Source range allowed to reach SSH. The default keeps the previous behaviour
# (open to everyone); override it with an admin network for real deployments.
variable "ssh_allowed_cidr" {
  description = "允许访问 SSH(22 端口)的来源 CIDR;生产环境应收紧为运维网段"
  type        = string
  default     = "0.0.0.0/0"
}

resource "alicloud_security_group" "sanguo_sg" {
  name        = "sanguo-vnpy-${var.environment}-sg"
  description = "sanguo_vnpy ${var.environment} 安全组"
  vpc_id      = alicloud_vpc.sanguo_vpc.id
}

# SSH (22/tcp).
resource "alicloud_security_group_rule" "allow_ssh" {
  type              = "ingress"
  ip_protocol       = "tcp"
  nic_type          = "intranet"
  policy            = "accept"
  port_range        = "22/22"
  priority          = 1
  security_group_id = alicloud_security_group.sanguo_sg.id
  cidr_ip           = var.ssh_allowed_cidr
}

# HTTP (80/tcp), open to any source.
resource "alicloud_security_group_rule" "allow_http" {
  type              = "ingress"
  ip_protocol       = "tcp"
  nic_type          = "intranet"
  policy            = "accept"
  port_range        = "80/80"
  priority          = 2
  security_group_id = alicloud_security_group.sanguo_sg.id
  cidr_ip           = "0.0.0.0/0"
}

# vn.py web port (8080/tcp), open to any source.
# NOTE(review): confirm a trading UI should be reachable from the whole
# internet rather than from a restricted range.
resource "alicloud_security_group_rule" "allow_vnpy" {
  type              = "ingress"
  ip_protocol       = "tcp"
  nic_type          = "intranet"
  policy            = "accept"
  port_range        = "8080/8080"
  priority          = 3
  security_group_id = alicloud_security_group.sanguo_sg.id
  cidr_ip           = "0.0.0.0/0"
}
|
||||
|
||||
# =============================================================================
# ECS instance
# =============================================================================

resource "alicloud_instance" "sanguo_ecs" {
  instance_name = "sanguo-vnpy-${var.environment}-ecs"
  # Taken from the vswitch so the instance and its subnet cannot end up in
  # different zones.
  availability_zone = alicloud_vswitch.sanguo_vswitch.zone_id
  instance_type     = var.instance_type
  security_groups   = [alicloud_security_group.sanguo_sg.id]
  vswitch_id        = alicloud_vswitch.sanguo_vswitch.id

  internet_charge_type       = "PayByTraffic"
  internet_max_bandwidth_out = 100

  system_disk_size     = 40
  system_disk_category = "cloud_efficiency"

  # NOTE(review): pinned to a dated image id; confirm it is still published
  # in the target region.
  image_id = "ubuntu_22_04_x64_20G_alibase_20240228.vhd"

  password             = var.ecs_password
  instance_charge_type = "PostPaid"
}

# Login password for the instance. There is deliberately no default: a
# password committed to version control is public, and SSH is reachable from
# the security group. Supply it via TF_VAR_ecs_password or a secrets store.
variable "ecs_password" {
  description = "ECS 实例密码"
  type        = string
  sensitive   = true

  validation {
    condition     = length(var.ecs_password) >= 8 && length(var.ecs_password) <= 30
    error_message = "The ecs_password value must be 8 to 30 characters long."
  }
}
|
||||
|
||||
# =============================================================================
# OSS object storage
# =============================================================================

# Private bucket named after the environment.
# NOTE(review): OSS bucket names share a global namespace, so this fixed name
# may already be taken - consider adding a unique suffix.
resource "alicloud_oss_bucket" "sanguo_oss" {
  bucket = "sanguo-vnpy-${var.environment}-data"
  acl    = "private"
}
|
||||
|
||||
# =============================================================================
# RDS database (optional)
# =============================================================================

# Created only when var.enable_rds is true (count is 0 otherwise).
resource "alicloud_db_instance" "sanguo_rds" {
  count = var.enable_rds ? 1 : 0

  engine           = "MySQL"
  engine_version   = "8.0"
  instance_type    = "rds.mysql.s2.large"
  instance_storage = 20
  vswitch_id       = alicloud_vswitch.sanguo_vswitch.id
  # Whitelist only the VPC range instead of 0.0.0.0/0, so the database is not
  # open to every address that can route to it.
  security_ips = [var.vpc_cidr]
  # The provider's argument for the display name is `instance_name`.
  instance_name = "sanguo-vnpy-${var.environment}-rds"
}

variable "enable_rds" {
  description = "是否启用 RDS 数据库"
  type        = bool
  default     = false
}
|
||||
|
||||
# =============================================================================
# Outputs
# =============================================================================

output "vpc_id" {
  description = "VPC ID"
  value       = alicloud_vpc.sanguo_vpc.id
}

output "ecs_public_ip" {
  description = "ECS 公网 IP"
  value       = alicloud_instance.sanguo_ecs.public_ip
}

output "ecs_private_ip" {
  description = "ECS 私网 IP"
  value       = alicloud_instance.sanguo_ecs.private_ip
}

output "oss_bucket_name" {
  description = "OSS 存储桶名称"
  value       = alicloud_oss_bucket.sanguo_oss.bucket
}

# Convenience strings built from the instance's public IP.
output "ecs_ssh_command" {
  description = "SSH 连接命令"
  value       = "ssh root@${alicloud_instance.sanguo_ecs.public_ip}"
}

output "vnpy_web_url" {
  description = "vn.py Web 界面地址"
  value       = "http://${alicloud_instance.sanguo_ecs.public_ip}:8080"
}
|
||||
@@ -0,0 +1,335 @@
|
||||
# sanguo_vnpy 阿里云生产环境应急响应方案
|
||||
|
||||
**版本**: v1.0
|
||||
**作者**: 姜维(后勤总督)
|
||||
**日期**: 2026-03-21
|
||||
|
||||
---
|
||||
|
||||
## 🚨 应急响应原则
|
||||
|
||||
### 1. 快速响应原则
|
||||
- **5分钟内**:发现问题并启动响应流程
|
||||
- **15分钟内**:完成初步诊断和影响评估
|
||||
- **30分钟内**:确定并执行恢复方案
|
||||
|
||||
### 2. 优先级原则
|
||||
- **P0(严重)**:系统完全不可用,数据丢失风险
|
||||
- **P1(高)**:核心功能异常,影响主要交易
|
||||
- **P2(中)**:次要功能异常,不影响核心交易
|
||||
- **P3(低)**:轻微问题,用户体验影响
|
||||
|
||||
### 3. 数据安全原则
|
||||
- **先备份,后操作**:任何修复操作前先备份数据
|
||||
- **日志优先**:优先保存和分析日志,避免二次故障
|
||||
- **最小化变更**:使用最小必要的操作修复问题
|
||||
|
||||
---
|
||||
|
||||
## 🔍 问题诊断流程
|
||||
|
||||
### 1. 监控告警触发
|
||||
|
||||
#### 告警来源
|
||||
1. **系统监控**:Prometheus + Grafana
|
||||
2. **应用监控**:sanguo_vnpy 内部健康检查
|
||||
3. **业务监控**:策略执行异常告警
|
||||
4. **用户反馈**:用户上报的问题
|
||||
|
||||
#### 告警级别对应
|
||||
| 告警类型 | 影响 | 响应级别 |
|
||||
|---------|------|---------|
|
||||
| 实例宕机 | 系统不可用 | P0 |
|
||||
| CPU > 90% 5分钟 | 性能下降 | P1 |
|
||||
| 内存 > 90% 5分钟 | 可能OOM | P1 |
|
||||
| 磁盘 > 95% | 数据写入失败 | P0 |
|
||||
| vn.py 进程消失 | 应用不可用 | P0 |
|
||||
| 策略执行失败 | 业务影响 | P1 |
|
||||
|
||||
### 2. 快速诊断步骤
|
||||
|
||||
#### 步骤 1: 检查系统状态
|
||||
```bash
|
||||
# 1. 检查服务器是否在线
|
||||
ping <server-ip>
|
||||
|
||||
# 2. SSH 登录(如果可能)
|
||||
ssh root@<server-ip>
|
||||
|
||||
# 3. 检查系统资源
|
||||
top
|
||||
htop
|
||||
df -h
|
||||
free -m
|
||||
|
||||
# 4. 检查网络
|
||||
ping 8.8.8.8
|
||||
curl -I https://www.aliyun.com
|
||||
```
|
||||
|
||||
#### 步骤 2: 检查服务状态
|
||||
```bash
|
||||
# 1. 检查 vn.py 进程
|
||||
ps aux | grep -i vnpy
|
||||
ps aux | grep -i python
|
||||
|
||||
# 2. 检查端口监听
|
||||
netstat -tlnp | grep 8080
|
||||
ss -tlnp | grep 8080
|
||||
|
||||
# 3. 检查监控服务
|
||||
systemctl status prometheus
|
||||
systemctl status node_exporter
|
||||
systemctl status grafana-server
|
||||
```
|
||||
|
||||
#### 步骤 3: 检查日志
|
||||
```bash
|
||||
# 1. 系统日志
|
||||
tail -100 /var/log/syslog
|
||||
tail -100 /var/log/messages
|
||||
|
||||
# 2. 应用日志
|
||||
tail -100 /path/to/sanguo_vnpy/logs/app.log
|
||||
tail -100 /path/to/sanguo_vnpy/logs/error.log
|
||||
|
||||
# 3. 云服务商日志
|
||||
# 阿里云控制台查看云服务器监控和事件
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔧 常见问题应急处理
|
||||
|
||||
### 场景 1: 实例完全宕机(P0)
|
||||
|
||||
#### 现象
|
||||
- Ping 无响应
|
||||
- SSH 无法连接
|
||||
- 监控显示实例离线
|
||||
|
||||
#### 应急处理
|
||||
1. **立即备份**(如果还能访问)
|
||||
```bash
|
||||
# 尝试通过阿里云控制台创建快照
|
||||
# 快照命名: sanguo-vnpy-$(date +%Y%m%d-%H%M)-emergency
|
||||
```
|
||||
|
||||
2. **重启实例**
|
||||
- 阿里云控制台 → 实例 → 重启
|
||||
- 等待 2-5 分钟
|
||||
|
||||
3. **如果重启失败**
|
||||
- 使用可用快照回滚
|
||||
- 或从备份数据重建实例
|
||||
|
||||
4. **验证恢复**
|
||||
```bash
|
||||
# 检查服务是否恢复
|
||||
ssh root@<server-ip>
|
||||
ps aux | grep vnpy
|
||||
curl http://localhost:8080/health
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 场景 2: vn.py 进程崩溃(P0)
|
||||
|
||||
#### 现象
|
||||
- 实例在线,但 vn.py 进程消失
|
||||
- 端口 8080 无响应
|
||||
- 应用监控告警
|
||||
|
||||
#### 应急处理
|
||||
1. **保存崩溃现场**
|
||||
```bash
|
||||
# 保存系统状态
|
||||
dmesg > /tmp/dmesg-$(date +%Y%m%d-%H%M).log
|
||||
vmstat 1 5 > /tmp/vmstat-$(date +%Y%m%d-%H%M).log
|
||||
|
||||
# 保存应用日志
|
||||
cp -r /path/to/sanguo_vnpy/logs /tmp/logs-backup-$(date +%Y%m%d-%H%M)
|
||||
```
|
||||
|
||||
2. **检查崩溃原因**
|
||||
```bash
|
||||
# 检查系统日志中的 OOM
|
||||
grep -i "out of memory" /var/log/syslog
|
||||
grep -i "killed process" /var/log/syslog
|
||||
|
||||
# 检查应用错误日志
|
||||
tail -200 /path/to/sanguo_vnpy/logs/error.log
|
||||
```
|
||||
|
||||
3. **快速重启服务**
|
||||
```bash
|
||||
# 进入项目目录
|
||||
cd /path/to/sanguo_vnpy
|
||||
|
||||
# 激活虚拟环境
|
||||
source venv/bin/activate
|
||||
|
||||
# 启动 vn.py
|
||||
python -m vnpy &
|
||||
|
||||
# 或者使用服务管理
|
||||
systemctl start sanguo-vnpy
|
||||
```
|
||||
|
||||
4. **验证恢复**
|
||||
```bash
|
||||
# 检查进程
|
||||
ps aux | grep vnpy
|
||||
|
||||
# 检查端口
|
||||
curl http://localhost:8080/health
|
||||
|
||||
# 检查监控
|
||||
# 确认 Prometheus 数据恢复
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 场景 3: 磁盘空间满(P0)
|
||||
|
||||
#### 现象
|
||||
- 磁盘使用率 > 95%
|
||||
- 数据写入失败
|
||||
- 应用无法保存数据
|
||||
|
||||
#### 应急处理
|
||||
1. **立即清理临时文件**
|
||||
```bash
|
||||
# 清理系统临时文件
|
||||
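# 注意:切勿直接执行 rm -rf /tmp/* —— 这会删除“场景 2”中保存到 /tmp 的崩溃现场(dmesg/vmstat/日志备份),也违背“先备份,后操作”原则
# 仅清理超过 1 天未修改的临时文件
find /tmp -xdev -type f -mtime +1 -delete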
|
||||
|
||||
# 清理应用缓存
|
||||
rm -rf /path/to/sanguo_vnpy/cache/*
|
||||
|
||||
# 清理旧日志(保留最近7天)
|
||||
find /path/to/sanguo_vnpy/logs -name "*.log" -mtime +7 -delete
|
||||
```
|
||||
|
||||
2. **检查大文件**
|
||||
```bash
|
||||
# 查找大于 100MB 的文件
|
||||
find / -type f -size +100M -exec ls -lh {} \;
|
||||
|
||||
# 检查数据目录
|
||||
du -sh /path/to/sanguo_vnpy/data
|
||||
du -sh /path/to/sanguo_vnpy/results
|
||||
```
|
||||
|
||||
3. **扩容磁盘(如果需要)**
|
||||
- 阿里云控制台 → 云盘 → 扩容
|
||||
- 或挂载新的数据盘
|
||||
|
||||
4. **验证恢复**
|
||||
```bash
|
||||
df -h
|
||||
# 确认使用率下降到安全范围(< 80%)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 场景 4: 数据库连接失败(P1)
|
||||
|
||||
#### 现象
|
||||
- 应用报错无法连接数据库
|
||||
- 策略无法获取数据
|
||||
- 回测无法执行
|
||||
|
||||
#### 应急处理
|
||||
1. **检查数据库状态**
|
||||
```bash
|
||||
# 如果使用 RDS
|
||||
# 阿里云控制台检查 RDS 状态
|
||||
|
||||
# 如果使用本地 SQLite
|
||||
ls -lh /path/to/sanguo_vnpy/data/*.db
|
||||
# 检查文件权限和完整性
|
||||
```
|
||||
|
||||
2. **网络连接测试**
|
||||
```bash
|
||||
# 测试数据库连接
|
||||
telnet <db-host> <db-port>
|
||||
nc -zv <db-host> <db-port>
|
||||
|
||||
# 检查安全组
|
||||
# 确认应用服务器 IP 在数据库白名单中
|
||||
```
|
||||
|
||||
3. **快速恢复方案**
|
||||
```bash
|
||||
# 方案 A: 切换到本地缓存数据
|
||||
# 修改配置使用 akshare 直接获取
|
||||
|
||||
# 方案 B: 从备份恢复数据库
|
||||
# 恢复最近的数据库备份
|
||||
|
||||
# 方案 C: 重启数据库服务
|
||||
# 如果是自建数据库,重启服务
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📋 应急响应检查清单
|
||||
|
||||
### 响应前检查
|
||||
- [ ] 确认告警级别和影响范围
|
||||
- [ ] 通知相关人员(诸葛亮/主公)
|
||||
- [ ] 保存当前系统状态和日志
|
||||
- [ ] 准备回滚方案
|
||||
|
||||
### 响应中检查
|
||||
- [ ] 执行诊断步骤,确定根因
|
||||
- [ ] 执行最小必要的修复操作
|
||||
- [ ] 持续监控系统状态
|
||||
- [ ] 记录所有操作和时间点
|
||||
|
||||
### 响应后检查
|
||||
- [ ] 验证核心功能恢复正常
|
||||
- [ ] 验证监控数据正常
|
||||
- [ ] 验证业务数据完整性
|
||||
- [ ] 总结故障原因和改进措施
|
||||
|
||||
---
|
||||
|
||||
## 📞 联络清单
|
||||
|
||||
### 紧急联络
|
||||
- **总军师(诸葛亮)**:负责决策和协调
|
||||
- **主公**:最终决策和资源协调
|
||||
- **赵云**:数据库和数据相关问题
|
||||
- **关羽**:回测引擎和风控问题
|
||||
- **张飞**:API 兼容层问题
|
||||
- **司马懿**:安全和合规问题
|
||||
|
||||
### 阿里云支持
|
||||
- **阿里云控制台**:https://home.console.aliyun.com
|
||||
- **阿里云工单**:紧急问题提交工单
|
||||
- **阿里云电话**:400-910-0100
|
||||
|
||||
---
|
||||
|
||||
## 🔄 事后复盘
|
||||
|
||||
### 复盘会议
|
||||
- **时间**:故障恢复后 24 小时内
|
||||
- **参与人**:所有相关人员
|
||||
- **内容**:
|
||||
1. 故障回顾和时间线
|
||||
2. 根因分析
|
||||
3. 响应流程评估
|
||||
4. 改进措施讨论
|
||||
|
||||
### 改进措施
|
||||
- 技术改进:防止同类故障再次发生
|
||||
- 流程改进:优化响应流程
|
||||
- 监控改进:完善告警和监控
|
||||
- 文档改进:更新应急方案
|
||||
|
||||
---
|
||||
|
||||
**本应急方案会持续更新,确保生产环境安全稳定运行!** 🚛
|
||||
+211
@@ -0,0 +1,211 @@
|
||||
#!/usr/bin/env bash
# =============================================================================
# sanguo_vnpy Alibaba Cloud production monitoring stack installer
# Version: v1.0
# Author:  Jiang Wei (logistics chief)
# =============================================================================

# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

echo "========================================================================"
echo "🚀 sanguo_vnpy 监控系统部署"
echo "========================================================================"

# ANSI colour escape sequences used by the print_* helpers.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # reset ("no colour")
|
||||
|
||||
# Print an informational message to stdout, prefixed with a coloured [INFO] tag.
# Globals:   BLUE, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_info() {
  printf '%b[INFO]%b %s\n' "${BLUE:-}" "${NC:-}" "${1:-}"
}
|
||||
|
||||
# Print a success message to stdout, prefixed with a coloured [SUCCESS] tag.
# Globals:   GREEN, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN:-}" "${NC:-}" "${1:-}"
}
|
||||
|
||||
# Print a warning message to stdout, prefixed with a coloured [WARNING] tag.
# Globals:   YELLOW, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW:-}" "${NC:-}" "${1:-}"
}
|
||||
|
||||
# Print an error message, prefixed with a coloured [ERROR] tag.
# Diagnostics are written to stderr so they are not mixed into captured stdout.
# Globals:   RED, NC (read) - colour escape sequences; treated as empty if unset
# Arguments: $1 - message text, printed verbatim (backslashes are not interpreted)
print_error() {
  printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "${1:-}" >&2
}
|
||||
|
||||
# Step 1: install Prometheus, the node exporter, Grafana and nginx with the
# system package manager (apt-get or yum); any other platform is rejected.
print_info "步骤 1: 安装系统依赖..."

if command -v apt-get > /dev/null 2>&1; then
  sudo apt-get update
  # Debian/Ubuntu package the node exporter as "prometheus-node-exporter";
  # there is no "node_exporter" package.
  sudo apt-get install -y prometheus prometheus-node-exporter nginx
  # Grafana is not in the stock Debian/Ubuntu archives, so this only works
  # once the Grafana APT repository has been configured on the host.
  if ! sudo apt-get install -y grafana; then
    print_error "Grafana 安装失败: 请先配置 Grafana 官方 APT 软件源 (https://apt.grafana.com)"
    exit 1
  fi
elif command -v yum > /dev/null 2>&1; then
  sudo yum install -y epel-release
  # NOTE(review): package names kept as written; confirm they exist in the
  # repositories enabled on the target distribution.
  sudo yum install -y prometheus node_exporter grafana nginx
else
  print_error "不支持的操作系统"
  exit 1
fi

print_success "系统依赖安装完成"
|
||||
|
||||
# Step 2: write the Prometheus configuration.
print_info "步骤 2: 配置 Prometheus..."

sudo mkdir -p /etc/prometheus
# `sudo cat > file` would open the file as the calling user, because the
# redirection is performed by this shell and not by sudo; piping the here-doc
# through `sudo tee` makes the write itself privileged.
# NOTE(review): the config points at an Alertmanager on localhost:9093, which
# this script does not install - alerts will not be delivered until one runs.
sudo tee /etc/prometheus/prometheus.yml > /dev/null << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node_exporter'
    static_configs:
      - targets: ['localhost:9100']

  - job_name: 'sanguo_vnpy'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/metrics'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

rule_files:
  - "/etc/prometheus/alerts.yml"
EOF
|
||||
|
||||
# Step 3: write the Prometheus alerting rules.
print_info "步骤 3: 配置告警规则..."

# `sudo tee` performs the write with privileges (a `sudo cat > file`
# redirection is opened by the unprivileged calling shell). The quoted 'EOF'
# keeps {{ $labels.* }} literal for Prometheus to expand.
sudo tee /etc/prometheus/alerts.yml > /dev/null << 'EOF'
groups:
  - name: sanguo_vnpy_alerts
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 已宕机"
          description: "{{ $labels.job }} 实例 {{ $labels.instance }} 已宕机超过 5 分钟"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "实例 {{ $labels.instance }} CPU 使用率过高"
          description: "{{ $labels.instance }} CPU 使用率超过 80%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "实例 {{ $labels.instance }} 内存使用率过高"
          description: "{{ $labels.instance }} 内存使用率超过 80%"

      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘空间不足"
          description: "{{ $labels.instance }} 磁盘使用率超过 90%"

      - alert: VnpyDown
        expr: up{job="sanguo_vnpy"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "sanguo_vnpy 服务已宕机"
          description: "sanguo_vnpy 服务已宕机超过 2 分钟,请立即检查!"
EOF

# Validate the rules file when promtool is available, so a malformed file is
# caught here rather than when Prometheus loads it.
if command -v promtool > /dev/null 2>&1; then
  promtool check rules /etc/prometheus/alerts.yml
fi
|
||||
|
||||
# Step 4: provision Prometheus as Grafana's default data source.
print_info "步骤 4: 配置 Grafana..."

sudo mkdir -p /etc/grafana/provisioning/datasources
# `sudo tee` performs the write with privileges; with `sudo cat > file` the
# redirection is opened by the unprivileged calling shell.
sudo tee /etc/grafana/provisioning/datasources/prometheus.yml > /dev/null << 'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://localhost:9090
    isDefault: true
    editable: true
EOF
|
||||
|
||||
# Step 5: enable the monitoring services at boot and (re)start them.
print_info "步骤 5: 启动监控服务..."

# Debian/Ubuntu install the exporter unit as "prometheus-node-exporter";
# fall back to "node_exporter" when that unit is not present.
node_exporter_unit="node_exporter"
if systemctl cat prometheus-node-exporter.service > /dev/null 2>&1; then
  node_exporter_unit="prometheus-node-exporter"
fi

for unit in prometheus "$node_exporter_unit" grafana-server; do
  sudo systemctl enable "$unit"
done

# restart rather than start: `start` does nothing for a service that is
# already running, so the configuration written in the previous steps would
# not be loaded.
for unit in prometheus "$node_exporter_unit" grafana-server; do
  sudo systemctl restart "$unit"
done
|
||||
|
||||
# Step 6: publish Grafana, Prometheus and the node exporter behind nginx on
# port 80 under /grafana/, /prometheus/ and /node-exporter/.
print_info "步骤 6: 配置 Nginx 反向代理..."

# `sudo tee` performs the write with privileges; with `sudo cat > file` the
# redirection is opened by the unprivileged calling shell. The quoted 'EOF'
# keeps nginx variables such as $host literal.
# NOTE(review): sites-available/sites-enabled is the Debian/Ubuntu layout;
# confirm the yum-based path of this script has an equivalent include.
# NOTE(review): these locations have no authentication; confirm that exposing
# Prometheus and node-exporter on port 80 is intended.
sudo tee /etc/nginx/sites-available/sanguo-monitoring > /dev/null << 'EOF'
server {
    listen 80;
    server_name _;

    location /grafana/ {
        proxy_pass http://localhost:3000/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    location /prometheus/ {
        proxy_pass http://localhost:9090/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
    }

    location /node-exporter/ {
        proxy_pass http://localhost:9100/;
        proxy_set_header Host $host;
    }
}
EOF

# Enable the site, validate the full nginx configuration, then reload.
sudo ln -sf /etc/nginx/sites-available/sanguo-monitoring /etc/nginx/sites-enabled/
sudo nginx -t
sudo systemctl reload nginx
|
||||
|
||||
# Step 7: print where the services can be reached.
print_success "监控系统部署完成!"

# Resolve the first address reported by `hostname -I` once. The `||` guard
# keeps a lookup failure from aborting the script right after a successful
# install; a placeholder is shown when no address is found.
server_ip=$(hostname -I 2> /dev/null | awk '{print $1}') || server_ip=""
server_ip="${server_ip:-<server-ip>}"

echo ""
echo "========================================================================"
echo "📊 监控系统访问信息"
echo "========================================================================"
echo "Grafana: http://${server_ip}:3000"
echo "Prometheus: http://${server_ip}:9090"
echo "Node Exporter: http://${server_ip}:9100"
echo ""
# NOTE(review): this advertises Grafana's default credentials; they should be
# changed on first login.
echo "默认 Grafana 账号: admin / admin"
echo "========================================================================"
echo ""
|
||||
Reference in New Issue
Block a user