initial-import: 2026-04-11 21:18:55

2026-04-11 21:18:55 +08:00
commit 5e6b2d73eb
264 changed files with 117047 additions and 0 deletions
@@ -0,0 +1,501 @@
+#!/usr/bin/env python3
+"""
+数据格式转换工具 - 姜维
+功能：将赵云将军的本地数据格式转换为vn.py兼容格式
+"""
+
+import pandas as pd
+import os
+import glob
+import json
+import logging
+from datetime import datetime
+from pathlib import Path
+
+# 配置日志
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('data_convert_tool.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+class DataConverter:
+    """
+    数据格式转换器
+    赵云格式 → vn.py格式
+    """
+    
+    # 赵云数据字段映射到vn.py字段
+    FIELD_MAPPING = {
+        # 基本字段
+        'date': 'datetime',
+        'open': 'open_price',
+        'high': 'high_price',
+        'low': 'low_price',
+        'close': 'close_price',
+        'volume': 'volume',
+        'amount': 'turnover',  # 注意：vn.py中turnover是成交额
+        'turnover': 'turnover_rate',  # 换手率
+        
+        # 可选字段
+        'outstanding_share': 'outstanding_share',
+        'year': 'year',
+        
+        # 财务数据字段
+        'pe_ttm': 'pe_ttm',
+        'pb': 'pb',
+        'roe': 'roe',
+        'total_market_cap': 'total_market_cap',
+        'circulating_market_cap': 'circulating_market_cap',
+    }
+    
+    # 必需字段
+    REQUIRED_FIELDS = ['date', 'open', 'high', 'low', 'close', 'volume']
+    
+    def __init__(self, zhaoyun_data_dir: str, output_dir: str):
+        """
+        初始化转换器
+        
+        Args:
+            zhaoyun_data_dir: 赵云数据目录
+            output_dir: 输出目录
+        """
+        self.zhaoyun_dir = zhaoyun_data_dir
+        self.output_dir = output_dir
+        
+        # 创建输出目录
+        os.makedirs(output_dir, exist_ok=True)
+        
+        # 子目录结构
+        self.subdirs = {
+            'daily': 'daily',
+            'minute': 'minute',
+            'financial': 'financial',
+            'stock_info': 'stock_info',
+        }
+        
+        for subdir in self.subdirs.values():
+            os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)
+    
+    def analyze_zhaoyun_structure(self) -> dict:
+        """
+        分析赵云数据目录结构
+        
+        Returns:
+            结构分析报告
+        """
+        report = {
+            'timestamp': datetime.now().isoformat(),
+            'zhaoyun_dir': self.zhaoyun_dir,
+            'exists': os.path.exists(self.zhaoyun_dir),
+            'subdirectories': {},
+            'file_counts': {},
+            'sample_files': {},
+            'data_quality': {},
+        }
+        
+        if not report['exists']:
+            logger.error(f"赵云数据目录不存在: {self.zhaoyun_dir}")
+            return report
+        
+        # 分析子目录
+        for subdir in ['raw/daily', 'raw/financial', 'raw/stock_info', 'raw/minute_kline']:
+            full_path = os.path.join(self.zhaoyun_dir, subdir)
+            if os.path.exists(full_path):
+                # 统计文件
+                parquet_files = list(glob.glob(os.path.join(full_path, '**/*.parquet'), recursive=True))
+                csv_files = list(glob.glob(os.path.join(full_path, '**/*.csv'), recursive=True))
+                
+                report['subdirectories'][subdir] = {
+                    'path': full_path,
+                    'parquet_count': len(parquet_files),
+                    'csv_count': len(csv_files),
+                    'total_files': len(parquet_files) + len(csv_files),
+                }
+                
+                # 取样分析
+                if parquet_files:
+                    sample_file = parquet_files[0]
+                    try:
+                        df = pd.read_parquet(sample_file)
+                        report['sample_files'][subdir] = {
+                            'file': sample_file,
+                            'rows': len(df),
+                            'columns': list(df.columns),
+                            'dtypes': str(df.dtypes.to_dict()),
+                            'date_range': {
+                                'min': str(df['date'].min()) if 'date' in df.columns else 'N/A',
+                                'max': str(df['date'].max()) if 'date' in df.columns else 'N/A',
+                            } if 'date' in df.columns else {},
+                        }
+                    except Exception as e:
+                        report['sample_files'][subdir] = {'error': str(e)}
+        
+        logger.info(f"赵云数据结构分析完成")
+        return report
+    
+    def convert_daily_data(self, year: int = None, symbols: list = None, limit: int = None):
+        """
+        转换日线数据
+        
+        Args:
+            year: 指定年份，None表示所有年份
+            symbols: 指定股票代码列表，None表示所有股票
+            limit: 限制转换数量（用于测试）
+        """
+        daily_dir = os.path.join(self.zhaoyun_dir, 'raw/daily')
+        if not os.path.exists(daily_dir):
+            logger.error(f"赵云日线数据目录不存在: {daily_dir}")
+            return
+        
+        # 确定年份范围
+        if year:
+            years = [str(year)]
+        else:
+            years = [d for d in os.listdir(daily_dir) if os.path.isdir(os.path.join(daily_dir, d))]
+            years.sort()
+        
+        logger.info(f"开始转换日线数据，年份: {years}")
+        
+        total_converted = 0
+        total_failed = 0
+        
+        for year_dir in years:
+            year_path = os.path.join(daily_dir, year_dir)
+            output_year_path = os.path.join(self.output_dir, 'daily', year_dir)
+            os.makedirs(output_year_path, exist_ok=True)
+            
+            # 查找所有parquet文件
+            parquet_files = glob.glob(os.path.join(year_path, '*.parquet'))
+            
+            if symbols:
+                # 过滤指定股票
+                filtered_files = []
+                for file in parquet_files:
+                    file_name = os.path.basename(file)
+                    # 从文件名提取股票代码
+                    if 'sh' in file_name:
+                        symbol = file_name.split('_')[0][2:] + '.SH'
+                    elif 'sz' in file_name:
+                        symbol = file_name.split('_')[0][2:] + '.SZ'
+                    elif 'bj' in file_name:
+                        symbol = file_name.split('_')[0][2:] + '.BJ'
+                    else:
+                        symbol = file_name.split('_')[0]
+                    
+                    if symbol in symbols or symbol.replace('.SH', '').replace('.SZ', '').replace('.BJ', '') in symbols:
+                        filtered_files.append(file)
+                parquet_files = filtered_files
+            
+            if limit:
+                parquet_files = parquet_files[:limit]
+            
+            logger.info(f"转换 {year_dir} 年数据，共 {len(parquet_files)} 个文件")
+            
+            for file_idx, file_path in enumerate(parquet_files, 1):
+                try:
+                    # 从文件名提取信息
+                    file_name = os.path.basename(file_path)
+                    
+                    # 解析股票代码和交易所
+                    if file_name.startswith('sh'):
+                        symbol = file_name[2:8]  # 提取6位数字代码
+                        exchange = 'SH'
+                    elif file_name.startswith('sz'):
+                        symbol = file_name[2:8]
+                        exchange = 'SZ'
+                    elif file_name.startswith('bj'):
+                        symbol = file_name[2:8]
+                        exchange = 'BJ'
+                    else:
+                        symbol = file_name.split('_')[0]
+                        exchange = 'SH'  # 默认
+                    
+                    # 读取数据
+                    df = pd.read_parquet(file_path)
+                    
+                    # 检查必需字段
+                    missing_fields = [field for field in self.REQUIRED_FIELDS if field not in df.columns]
+                    if missing_fields:
+                        logger.warning(f"文件 {file_name} 缺少必需字段: {missing_fields}")
+                        total_failed += 1
+                        continue
+                    
+                    # 创建vn.py格式DataFrame
+                    vnpy_df = pd.DataFrame()
+                    
+                    # 转换字段
+                    for zhaoyun_field, vnpy_field in self.FIELD_MAPPING.items():
+                        if zhaoyun_field in df.columns:
+                            vnpy_df[vnpy_field] = df[zhaoyun_field]
+                    
+                    # 特殊处理datetime字段
+                    if 'datetime' not in vnpy_df.columns and 'date' in df.columns:
+                        vnpy_df['datetime'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d %H:%M:%S')
+                    
+                    # 添加标识字段
+                    vnpy_df['symbol'] = symbol
+                    vnpy_df['exchange'] = exchange
+                    vnpy_df['interval'] = '1d'
+                    
+                    # 添加唯一ID（可选）
+                    vnpy_df['id'] = range(1, len(vnpy_df) + 1)
+                    
+                    # 输出文件名
+                    output_file = os.path.join(output_year_path, f"{exchange}{symbol}_daily_vnpy.parquet")
+                    
+                    # 保存为parquet
+                    vnpy_df.to_parquet(output_file, index=False)
+                    
+                    total_converted += 1
+                    
+                    if file_idx % 100 == 0 or file_idx == len(parquet_files):
+                        logger.info(f"进度: {year_dir}年 {file_idx}/{len(parquet_files)} 转换: {total_converted} 失败: {total_failed}")
+                        
+                except Exception as e:
+                    logger.error(f"转换文件失败 {file_path}: {e}")
+                    total_failed += 1
+        
+        logger.info(f"日线数据转换完成: 成功 {total_converted}, 失败 {total_failed}")
+        
+        # 保存转换报告
+        report = {
+            'conversion_date': datetime.now().isoformat(),
+            'zhaoyun_dir': daily_dir,
+            'output_dir': os.path.join(self.output_dir, 'daily'),
+            'years_converted': years,
+            'total_converted': total_converted,
+            'total_failed': total_failed,
+            'symbols_converted': symbols if symbols else 'ALL',
+        }
+        
+        report_file = os.path.join(self.output_dir, 'daily_conversion_report.json')
+        with open(report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, ensure_ascii=False, indent=2)
+        
+        logger.info(f"转换报告已保存: {report_file}")
+    
+    def convert_stock_info(self):
+        """转换股票基础信息"""
+        stock_info_dir = os.path.join(self.zhaoyun_dir, 'raw/stock_info')
+        if not os.path.exists(stock_info_dir):
+            logger.warning(f"赵云股票信息目录不存在: {stock_info_dir}")
+            return
+        
+        # 查找股票信息文件
+        stock_files = glob.glob(os.path.join(stock_info_dir, '*.parquet')) + \
+                     glob.glob(os.path.join(stock_info_dir, '*.csv'))
+        
+        if not stock_files:
+            logger.warning(f"未找到股票信息文件")
+            return
+        
+        logger.info(f"开始转换股票信息，共 {len(stock_files)} 个文件")
+        
+        all_stock_info = []
+        
+        for file_path in stock_files:
+            try:
+                # 读取文件
+                if file_path.endswith('.parquet'):
+                    df = pd.read_parquet(file_path)
+                else:
+                    df = pd.read_csv(file_path)
+                
+                # 标准化字段名
+                column_mapping = {
+                    '代码': 'symbol',
+                    '名称': 'name',
+                    '行业': 'industry',
+                    '市场': 'market',
+                    '上市日期': 'list_date',
+                    '总市值': 'total_market_cap',
+                    '流通市值': 'circulating_market_cap',
+                    '市盈率': 'pe',
+                    '市净率': 'pb',
+                    'ROE': 'roe',
+                }
+                
+                df = df.rename(columns={k: v for k, v in column_mapping.items() if k in df.columns})
+                
+                # 添加exchange字段
+                if 'symbol' in df.columns:
+                    df['exchange'] = df['symbol'].apply(lambda x: 'SH' if str(x).startswith('6') else 'SZ')
+                
+                all_stock_info.append(df)
+                logger.info(f"转换股票信息文件: {os.path.basename(file_path)} ({len(df)} 条记录)")
+                
+            except Exception as e:
+                logger.error(f"转换股票信息失败 {file_path}: {e}")
+        
+        if all_stock_info:
+            # 合并所有数据
+            combined_df = pd.concat(all_stock_info, ignore_index=True)
+            
+            # 去重
+            if 'symbol' in combined_df.columns:
+                combined_df = combined_df.drop_duplicates(subset=['symbol'])
+            
+            # 保存
+            output_file = os.path.join(self.output_dir, 'stock_info', 'stock_basic_info_vnpy.parquet')
+            combined_df.to_parquet(output_file, index=False)
+            
+            logger.info(f"股票信息转换完成: {output_file} ({len(combined_df)} 只股票)")
+    
+    def create_config_file(self):
+        """创建vn.py配置文件"""
+        config = {
+            'data_source': 'zhaoyun_local_data',
+            'data_directory': os.path.abspath(self.output_dir),
+            'priority': 'local_first',
+            'fields_mapping': self.FIELD_MAPPING,
+            'created_at': datetime.now().isoformat(),
+            'description': '赵云本地数据 → vn.py格式转换配置',
+            'usage': {
+                'daily_data_path': '{data_directory}/daily/{year}/{exchange}{symbol}_daily_vnpy.parquet',
+                'stock_info_path': '{data_directory}/stock_info/stock_basic_info_vnpy.parquet',
+                'python_import': 'from vnpy_local_data_adapter import VnpyLocalDataAdapter',
+                'init_code': 'adapter = VnpyLocalDataAdapter(use_local_first=True)',
+            }
+        }
+        
+        config_file = os.path.join(self.output_dir, 'vnpy_data_config.json')
+        with open(config_file, 'w', encoding='utf-8') as f:
+            json.dump(config, f, ensure_ascii=False, indent=2)
+        
+        logger.info(f"vn.py配置文件已创建: {config_file}")
+        
+        # 创建使用说明
+        readme = f"""# vn.py本地数据使用说明
+
+## 数据来源
+- 原始数据：赵云将军下载的A股数据
+- 转换工具：姜维数据格式转换器
+- 输出格式：vn.py兼容的parquet格式
+
+## 目录结构
+```
+{self.output_dir}/
+├── daily/                    # 日线数据
+│   ├── 2010/                # 按年分区
+│   ├── 2011/
+│   └── ...
+├── stock_info/              # 股票基础信息
+│   └── stock_basic_info_vnpy.parquet
+├── vnpy_data_config.json    # 配置文件
+└── daily_conversion_report.json  # 转换报告
+```
+
+## 使用方法
+
+### 1. 在vn.py策略中使用
+```python
+from vnpy_local_data_adapter import VnpyLocalDataAdapter
+
+# 创建适配器（优先使用本地数据）
+adapter = VnpyLocalDataAdapter(use_local_first=True)
+
+# 获取数据
+data = adapter.get_daily_data("000001.SZ", "2024-01-01", "2024-03-01")
+```
+
+### 2. 直接读取数据
+```python
+import pandas as pd
+
+# 读取日线数据
+file_path = "{self.output_dir}/daily/2024/SH600000_daily_vnpy.parquet"
+df = pd.read_parquet(file_path)
+
+# 读取股票信息
+stock_info_path = "{self.output_dir}/stock_info/stock_basic_info_vnpy.parquet"
+stock_info = pd.read_parquet(stock_info_path)
+```
+
+### 3. 验证数据结构
+```python
+from vnpy_local_data_adapter import VnpyLocalDataAdapter
+
+adapter = VnpyLocalDataAdapter()
+result = adapter.verify_local_data_structure("000001.SZ")
+print(result)
+```
+
+## 数据更新
+1. 联系赵云将军更新原始数据
+2. 运行数据转换工具更新vn.py格式数据
+3. 验证数据完整性
+
+## 注意事项
+- 本地数据优先，缺失时自动回退到akshare
+- 数据文件按年分区，提高查询效率
+- 定期检查数据完整性
+
+**维护者**: 姜维（后勤总督）
+**数据源**: 赵云（数据工程将军）
+**最后更新**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+"""
+        
+        readme_file = os.path.join(self.output_dir, 'README.md')
+        with open(readme_file, 'w', encoding='utf-8') as f:
+            f.write(readme)
+        
+        logger.info(f"使用说明已创建: {readme_file}")
+
+
+def main():
+    """主函数"""
+    print("=" * 60)
+    print("赵云数据 → vn.py格式转换工具")
+    print("=" * 60)
+    
+    # 配置路径
+    ZHAOYUN_DATA_DIR = "/Users/chufeng/nas/stock/sanguo_vnpy/zhaoyun-data/data"
+    OUTPUT_DIR = "/Users/chufeng/.openclaw/workspace-jiangwei/vnpy_local_data"
+    
+    # 创建转换器
+    converter = DataConverter(ZHAOYUN_DATA_DIR, OUTPUT_DIR)
+    
+    # 1. 分析数据结构
+    print("\n1. 分析赵云数据结构...")
+    structure_report = converter.analyze_zhaoyun_structure()
+    
+    if not structure_report['exists']:
+        print(f"❌ 赵云数据目录不存在: {ZHAOYUN_DATA_DIR}")
+        return
+    
+    print(f"✅ 赵云数据目录有效")
+    for subdir, info in structure_report['subdirectories'].items():
+        print(f"   {subdir}: {info['total_files']} 个文件")
+    
+    # 2. 转换日线数据（测试模式，只转换2024年的前10个文件）
+    print("\n2. 转换日线数据（测试模式）...")
+    converter.convert_daily_data(year=2024, limit=10)
+    
+    # 3. 转换股票信息
+    print("\n3. 转换股票信息...")
+    converter.convert_stock_info()
+    
+    # 4. 创建配置文件
+    print("\n4. 创建配置文件...")
+    converter.create_config_file()
+    
+    print("\n" + "=" * 60)
+    print("转换完成！")
+    print(f"输出目录: {OUTPUT_DIR}")
+    print("=" * 60)
+    print("\n下一步操作:")
+    print("1. 将 vnpy_local_data_adapter.py 集成到vn.py策略中")
+    print("2. 配置数据路径: vnpy_data_config.json")
+    print("3. 测试数据加载: python test_vnpy_data.py")
+    print("4. 联系赵云将军更新数据")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()