initial-import: 2026-04-11 21:18:55

This commit is contained in:
cfdaily
2026-04-11 21:18:55 +08:00
commit 5e6b2d73eb
264 changed files with 117047 additions and 0 deletions
+501
View File
@@ -0,0 +1,501 @@
#!/usr/bin/env python3
"""
数据格式转换工具 - 姜维
功能:将赵云将军的本地数据格式转换为vn.py兼容格式
"""
import pandas as pd
import os
import glob
import json
import logging
from datetime import datetime
from pathlib import Path
# Configure logging: every record goes both to a log file in the current
# working directory and to the console.  The file handler is opened as UTF-8
# explicitly because the messages emitted by this tool are Chinese text and
# the platform default encoding is not guaranteed to represent them.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('data_convert_tool.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
class DataConverter:
    """Convert Zhao Yun's locally stored market data into vn.py-style files.

    Input is read from ``<zhaoyun_data_dir>/raw/daily/<year>/*.parquet`` and
    ``<zhaoyun_data_dir>/raw/stock_info/*.{parquet,csv}``; results are written
    beneath ``output_dir`` (``daily/<year>/`` and ``stock_info/``), together
    with a JSON conversion report, a JSON config file and a README.
    """

    # Source column name -> vn.py column name.
    FIELD_MAPPING = {
        # Basic bar fields
        'date': 'datetime',
        'open': 'open_price',
        'high': 'high_price',
        'low': 'low_price',
        'close': 'close_price',
        'volume': 'volume',
        'amount': 'turnover',  # note: in vn.py "turnover" is the traded amount
        'turnover': 'turnover_rate',  # turnover rate
        # Optional fields
        'outstanding_share': 'outstanding_share',
        'year': 'year',
        # Financial fields
        'pe_ttm': 'pe_ttm',
        'pb': 'pb',
        'roe': 'roe',
        'total_market_cap': 'total_market_cap',
        'circulating_market_cap': 'circulating_market_cap',
    }

    # Columns a daily file must contain to be converted.
    REQUIRED_FIELDS = ['date', 'open', 'high', 'low', 'close', 'volume']

    # Two-letter prefixes (file names / codes) and the exchange they denote.
    _EXCHANGE_PREFIXES = {'sh': 'SH', 'sz': 'SZ', 'bj': 'BJ'}

    # Chinese stock-info column headers -> normalized column names.
    _STOCK_INFO_COLUMNS = {
        '代码': 'symbol',
        '名称': 'name',
        '行业': 'industry',
        '市场': 'market',
        '上市日期': 'list_date',
        '总市值': 'total_market_cap',
        '流通市值': 'circulating_market_cap',
        '市盈率': 'pe',
        '市净率': 'pb',
        'ROE': 'roe',
    }

    def __init__(self, zhaoyun_data_dir: str, output_dir: str):
        """Remember the source/output directories and create the output tree.

        Args:
            zhaoyun_data_dir: Root directory of the source data.
            output_dir: Directory that receives the converted data; it and
                its ``daily``/``minute``/``financial``/``stock_info``
                sub-directories are created if missing.
        """
        self.zhaoyun_dir = zhaoyun_data_dir
        self.output_dir = output_dir

        os.makedirs(output_dir, exist_ok=True)

        # Logical name -> sub-directory name under output_dir.
        self.subdirs = {
            'daily': 'daily',
            'minute': 'minute',
            'financial': 'financial',
            'stock_info': 'stock_info',
        }
        for subdir in self.subdirs.values():
            os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @classmethod
    def _parse_file_name(cls, file_name):
        """Return ``(symbol, exchange)`` parsed from a daily data file name.

        A stem starting with ``sh``/``sz``/``bj`` yields the next six
        characters as the symbol and the matching exchange.  Any other stem
        yields the part before the first underscore and the default
        exchange ``'SH'``.  The extension never leaks into the symbol.
        """
        stem = os.path.splitext(os.path.basename(file_name))[0]
        exchange = cls._EXCHANGE_PREFIXES.get(stem[:2].lower())
        if exchange is not None:
            return stem[2:8], exchange
        return stem.split('_')[0], 'SH'  # default exchange

    @classmethod
    def _matches_symbols(cls, file_name, wanted) -> bool:
        """Tell whether ``file_name`` belongs to one of the ``wanted`` symbols.

        Both the bare code (``600000``) and the suffixed form
        (``600000.SH``) are accepted.
        """
        symbol, exchange = cls._parse_file_name(file_name)
        return symbol in wanted or f"{symbol}.{exchange}" in wanted

    @classmethod
    def _infer_exchange(cls, code) -> str:
        """Guess the exchange (``SH``/``SZ``/``BJ``) from a stock code.

        An explicit ``sh``/``sz``/``bj`` prefix wins.  Otherwise the numeric
        prefix decides; purely numeric codes are left-padded to six digits
        first so that codes stored as integers are classified correctly.
        Anything unrecognized falls back to ``'SZ'``.
        """
        text = str(code).strip()
        prefixed = cls._EXCHANGE_PREFIXES.get(text[:2].lower())
        if prefixed is not None:
            return prefixed
        if text.isdigit():
            text = text.zfill(6)
        if text.startswith(('6', '900')):
            return 'SH'
        # NOTE(review): Beijing Stock Exchange codes are assumed to start
        # with 43/83/87/88/92 - confirm against the exchange's code rules.
        if text.startswith(('43', '83', '87', '88', '92')):
            return 'BJ'
        return 'SZ'

    @classmethod
    def _to_vnpy_frame(cls, df, symbol: str, exchange: str):
        """Build the vn.py-style DataFrame for one symbol's daily bars.

        Columns present in ``df`` are copied under their mapped names (in
        ``FIELD_MAPPING`` order), then ``symbol``, ``exchange``,
        ``interval`` and a 1-based ``id`` column are appended.
        """
        vnpy_df = pd.DataFrame({
            target: df[source]
            for source, target in cls.FIELD_MAPPING.items()
            if source in df.columns
        })

        # Fallback only: with the built-in mapping 'date' is always copied to
        # 'datetime' above, so this runs solely when a subclass drops that
        # mapping entry.
        if 'datetime' not in vnpy_df.columns and 'date' in df.columns:
            vnpy_df['datetime'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d %H:%M:%S')

        # Identification columns
        vnpy_df['symbol'] = symbol
        vnpy_df['exchange'] = exchange
        vnpy_df['interval'] = '1d'
        # Sequential row id (optional)
        vnpy_df['id'] = range(1, len(vnpy_df) + 1)
        return vnpy_df

    @staticmethod
    def _read_stock_info_file(file_path: str):
        """Load one stock-info file (parquet or CSV) into a DataFrame.

        CSV code columns are read as strings so that leading zeros
        (e.g. ``000001``) are not lost to integer parsing.
        """
        if file_path.endswith('.parquet'):
            return pd.read_parquet(file_path)
        return pd.read_csv(file_path, dtype={'代码': str, 'symbol': str})

    @classmethod
    def _normalize_stock_info(cls, df):
        """Rename known Chinese headers and add an ``exchange`` column."""
        renames = {k: v for k, v in cls._STOCK_INFO_COLUMNS.items() if k in df.columns}
        df = df.rename(columns=renames)
        if 'symbol' in df.columns:
            df['exchange'] = df['symbol'].apply(cls._infer_exchange)
        return df

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def analyze_zhaoyun_structure(self) -> dict:
        """Inspect the source directory and summarize what it contains.

        Returns:
            A report dict.  ``exists`` tells whether the source directory
            was found; ``subdirectories`` holds parquet/CSV file counts per
            known sub-directory; ``sample_files`` describes the first
            parquet file found in each (or ``{'error': ...}`` if it could
            not be read).  ``file_counts`` and ``data_quality`` are present
            but left empty.
        """
        report = {
            'timestamp': datetime.now().isoformat(),
            'zhaoyun_dir': self.zhaoyun_dir,
            'exists': os.path.exists(self.zhaoyun_dir),
            'subdirectories': {},
            'file_counts': {},
            'sample_files': {},
            'data_quality': {},
        }
        if not report['exists']:
            logger.error(f"赵云数据目录不存在: {self.zhaoyun_dir}")
            return report

        for subdir in ['raw/daily', 'raw/financial', 'raw/stock_info', 'raw/minute_kline']:
            full_path = os.path.join(self.zhaoyun_dir, subdir)
            if not os.path.exists(full_path):
                continue

            # Count files (recursively)
            parquet_files = glob.glob(os.path.join(full_path, '**/*.parquet'), recursive=True)
            csv_files = glob.glob(os.path.join(full_path, '**/*.csv'), recursive=True)
            report['subdirectories'][subdir] = {
                'path': full_path,
                'parquet_count': len(parquet_files),
                'csv_count': len(csv_files),
                'total_files': len(parquet_files) + len(csv_files),
            }

            # Sample the first parquet file, if any
            if not parquet_files:
                continue
            sample_file = parquet_files[0]
            try:
                df = pd.read_parquet(sample_file)
                if 'date' in df.columns:
                    date_range = {
                        'min': str(df['date'].min()),
                        'max': str(df['date'].max()),
                    }
                else:
                    date_range = {}
                report['sample_files'][subdir] = {
                    'file': sample_file,
                    'rows': len(df),
                    'columns': list(df.columns),
                    'dtypes': str(df.dtypes.to_dict()),
                    'date_range': date_range,
                }
            except Exception as e:
                # Best effort: an unreadable sample must not abort the analysis.
                report['sample_files'][subdir] = {'error': str(e)}

        logger.info("赵云数据结构分析完成")
        return report

    def _convert_daily_file(self, file_path: str, output_year_path: str) -> bool:
        """Convert one daily parquet file; return False if fields are missing.

        Read/write errors propagate to the caller.
        """
        file_name = os.path.basename(file_path)
        symbol, exchange = self._parse_file_name(file_name)

        df = pd.read_parquet(file_path)

        missing_fields = [field for field in self.REQUIRED_FIELDS if field not in df.columns]
        if missing_fields:
            logger.warning(f"文件 {file_name} 缺少必需字段: {missing_fields}")
            return False

        vnpy_df = self._to_vnpy_frame(df, symbol, exchange)
        output_file = os.path.join(output_year_path, f"{exchange}{symbol}_daily_vnpy.parquet")
        vnpy_df.to_parquet(output_file, index=False)
        return True

    def convert_daily_data(self, year: int = None, symbols: list = None, limit: int = None):
        """Convert daily bar files to the vn.py layout and write a report.

        Args:
            year: Only convert this year's directory; ``None`` converts every
                year directory found under ``raw/daily``.
            symbols: Only convert these symbols (``600000`` or ``600000.SH``
                forms); ``None`` converts all files.
            limit: Convert at most this many files per year (for testing).

        A failure on one file is logged and counted, never raised.  A summary
        is written to ``daily_conversion_report.json`` in the output
        directory.
        """
        daily_dir = os.path.join(self.zhaoyun_dir, 'raw/daily')
        if not os.path.exists(daily_dir):
            logger.error(f"赵云日线数据目录不存在: {daily_dir}")
            return

        # Decide which year directories to process
        if year:
            years = [str(year)]
        else:
            years = [d for d in os.listdir(daily_dir) if os.path.isdir(os.path.join(daily_dir, d))]
            years.sort()
        logger.info(f"开始转换日线数据,年份: {years}")

        wanted = None
        if symbols:
            # Accept a single symbol passed as a plain string as well.
            wanted = {symbols} if isinstance(symbols, str) else set(symbols)

        total_converted = 0
        total_failed = 0
        for year_dir in years:
            year_path = os.path.join(daily_dir, year_dir)
            output_year_path = os.path.join(self.output_dir, 'daily', year_dir)
            os.makedirs(output_year_path, exist_ok=True)

            # Sorted so that ``limit`` selects a reproducible subset.
            parquet_files = sorted(glob.glob(os.path.join(year_path, '*.parquet')))
            if wanted is not None:
                parquet_files = [
                    path for path in parquet_files
                    if self._matches_symbols(os.path.basename(path), wanted)
                ]
            if limit:
                parquet_files = parquet_files[:limit]

            file_count = len(parquet_files)
            logger.info(f"转换 {year_dir} 年数据,共 {file_count} 个文件")

            for file_idx, file_path in enumerate(parquet_files, 1):
                try:
                    converted = self._convert_daily_file(file_path, output_year_path)
                except Exception as e:
                    logger.error(f"转换文件失败 {file_path}: {e}")
                    converted = False

                if converted:
                    total_converted += 1
                else:
                    total_failed += 1

                # Report progress every 100 files and at the end of the year,
                # whether or not the current file succeeded.
                if file_idx % 100 == 0 or file_idx == file_count:
                    logger.info(f"进度: {year_dir} {file_idx}/{file_count} 转换: {total_converted} 失败: {total_failed}")

        logger.info(f"日线数据转换完成: 成功 {total_converted}, 失败 {total_failed}")

        # Persist the conversion report
        report = {
            'conversion_date': datetime.now().isoformat(),
            'zhaoyun_dir': daily_dir,
            'output_dir': os.path.join(self.output_dir, 'daily'),
            'years_converted': years,
            'total_converted': total_converted,
            'total_failed': total_failed,
            'symbols_converted': symbols if symbols else 'ALL',
        }
        report_file = os.path.join(self.output_dir, 'daily_conversion_report.json')
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        logger.info(f"转换报告已保存: {report_file}")

    def convert_stock_info(self):
        """Merge all stock-info files into one normalized parquet file.

        Every ``*.parquet``/``*.csv`` file under ``raw/stock_info`` is read,
        its known Chinese headers are renamed, an ``exchange`` column is
        derived from ``symbol``, and the de-duplicated union is written to
        ``stock_info/stock_basic_info_vnpy.parquet``.  A file that fails to
        load is logged and skipped.
        """
        stock_info_dir = os.path.join(self.zhaoyun_dir, 'raw/stock_info')
        if not os.path.exists(stock_info_dir):
            logger.warning(f"赵云股票信息目录不存在: {stock_info_dir}")
            return

        # Sorted so that de-duplication keeps a reproducible record.
        stock_files = sorted(glob.glob(os.path.join(stock_info_dir, '*.parquet'))) + \
            sorted(glob.glob(os.path.join(stock_info_dir, '*.csv')))
        if not stock_files:
            logger.warning("未找到股票信息文件")
            return
        logger.info(f"开始转换股票信息,共 {len(stock_files)} 个文件")

        all_stock_info = []
        for file_path in stock_files:
            try:
                df = self._normalize_stock_info(self._read_stock_info_file(file_path))
                all_stock_info.append(df)
                logger.info(f"转换股票信息文件: {os.path.basename(file_path)} ({len(df)} 条记录)")
            except Exception as e:
                logger.error(f"转换股票信息失败 {file_path}: {e}")

        if not all_stock_info:
            return

        # Merge, then keep the first record seen for each symbol
        combined_df = pd.concat(all_stock_info, ignore_index=True)
        if 'symbol' in combined_df.columns:
            combined_df = combined_df.drop_duplicates(subset=['symbol'])

        output_file = os.path.join(self.output_dir, 'stock_info', 'stock_basic_info_vnpy.parquet')
        combined_df.to_parquet(output_file, index=False)
        logger.info(f"股票信息转换完成: {output_file} ({len(combined_df)} 只股票)")

    def create_config_file(self):
        """Write ``vnpy_data_config.json`` and ``README.md`` to the output dir."""
        config = {
            'data_source': 'zhaoyun_local_data',
            'data_directory': os.path.abspath(self.output_dir),
            'priority': 'local_first',
            'fields_mapping': self.FIELD_MAPPING,
            'created_at': datetime.now().isoformat(),
            'description': '赵云本地数据 → vn.py格式转换配置',
            'usage': {
                'daily_data_path': '{data_directory}/daily/{year}/{exchange}{symbol}_daily_vnpy.parquet',
                'stock_info_path': '{data_directory}/stock_info/stock_basic_info_vnpy.parquet',
                'python_import': 'from vnpy_local_data_adapter import VnpyLocalDataAdapter',
                'init_code': 'adapter = VnpyLocalDataAdapter(use_local_first=True)',
            }
        }
        config_file = os.path.join(self.output_dir, 'vnpy_data_config.json')
        with open(config_file, 'w', encoding='utf-8') as f:
            json.dump(config, f, ensure_ascii=False, indent=2)
        logger.info(f"vn.py配置文件已创建: {config_file}")

        # Usage notes (the text below is emitted verbatim into README.md)
        readme = f"""# vn.py本地数据使用说明
## 数据来源
- 原始数据:赵云将军下载的A股数据
- 转换工具:姜维数据格式转换器
- 输出格式:vn.py兼容的parquet格式
## 目录结构
```
{self.output_dir}/
├── daily/ # 日线数据
│ ├── 2010/ # 按年分区
│ ├── 2011/
│ └── ...
├── stock_info/ # 股票基础信息
│ └── stock_basic_info_vnpy.parquet
├── vnpy_data_config.json # 配置文件
└── daily_conversion_report.json # 转换报告
```
## 使用方法
### 1. 在vn.py策略中使用
```python
from vnpy_local_data_adapter import VnpyLocalDataAdapter
# 创建适配器(优先使用本地数据)
adapter = VnpyLocalDataAdapter(use_local_first=True)
# 获取数据
data = adapter.get_daily_data("000001.SZ", "2024-01-01", "2024-03-01")
```
### 2. 直接读取数据
```python
import pandas as pd
# 读取日线数据
file_path = "{self.output_dir}/daily/2024/SH600000_daily_vnpy.parquet"
df = pd.read_parquet(file_path)
# 读取股票信息
stock_info_path = "{self.output_dir}/stock_info/stock_basic_info_vnpy.parquet"
stock_info = pd.read_parquet(stock_info_path)
```
### 3. 验证数据结构
```python
from vnpy_local_data_adapter import VnpyLocalDataAdapter
adapter = VnpyLocalDataAdapter()
result = adapter.verify_local_data_structure("000001.SZ")
print(result)
```
## 数据更新
1. 联系赵云将军更新原始数据
2. 运行数据转换工具更新vn.py格式数据
3. 验证数据完整性
## 注意事项
- 本地数据优先,缺失时自动回退到akshare
- 数据文件按年分区,提高查询效率
- 定期检查数据完整性
**维护者**: 姜维(后勤总督)
**数据源**: 赵云(数据工程将军)
**最后更新**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
        readme_file = os.path.join(self.output_dir, 'README.md')
        with open(readme_file, 'w', encoding='utf-8') as f:
            f.write(readme)
        logger.info(f"使用说明已创建: {readme_file}")
# Default locations used when main() is called without arguments.
DEFAULT_ZHAOYUN_DATA_DIR = "/Users/chufeng/nas/stock/sanguo_vnpy/zhaoyun-data/data"
DEFAULT_OUTPUT_DIR = "/Users/chufeng/.openclaw/workspace-jiangwei/vnpy_local_data"


def main(zhaoyun_data_dir: str = DEFAULT_ZHAOYUN_DATA_DIR,
         output_dir: str = DEFAULT_OUTPUT_DIR):
    """Run the conversion pipeline end to end and print progress.

    Steps: analyze the source directory, convert daily data in test mode
    (year 2024, at most 10 files), convert stock info, then write the
    config file and README.  Returns early if the source directory does
    not exist.

    Args:
        zhaoyun_data_dir: Root directory of the source data.
        output_dir: Directory that receives the converted data.
    """
    banner = "=" * 60
    print(banner)
    print("赵云数据 → vn.py格式转换工具")
    print(banner)

    converter = DataConverter(zhaoyun_data_dir, output_dir)

    # 1. Analyze the source layout
    print("\n1. 分析赵云数据结构...")
    structure_report = converter.analyze_zhaoyun_structure()
    if not structure_report['exists']:
        print(f"❌ 赵云数据目录不存在: {zhaoyun_data_dir}")
        return
    print("✅ 赵云数据目录有效")
    for subdir, info in structure_report['subdirectories'].items():
        print(f" {subdir}: {info['total_files']} 个文件")

    # 2. Convert daily data (test mode: first 10 files of 2024 only)
    print("\n2. 转换日线数据(测试模式)...")
    converter.convert_daily_data(year=2024, limit=10)

    # 3. Convert stock info
    print("\n3. 转换股票信息...")
    converter.convert_stock_info()

    # 4. Write config file and README
    print("\n4. 创建配置文件...")
    converter.create_config_file()

    print("\n" + banner)
    print("转换完成!")
    print(f"输出目录: {output_dir}")
    print(banner)
    print("\n下一步操作:")
    print("1. 将 vnpy_local_data_adapter.py 集成到vn.py策略中")
    print("2. 配置数据路径: vnpy_data_config.json")
    print("3. 测试数据加载: python test_vnpy_data.py")
    print("4. 联系赵云将军更新数据")
    print(banner)


if __name__ == "__main__":
    import sys

    # Optional positional overrides: [source_dir [output_dir]].
    main(*sys.argv[1:3])