auto-sync: 2026-05-02 22:35:26

This commit is contained in:
cfdaily
2026-05-02 22:35:26 +08:00
parent cecb3e606d
commit d6709c205c
+84 -114
View File
@@ -2,22 +2,23 @@
""" """
15分钟线数据下载脚本 15分钟线数据下载脚本
数据源降级链: 数据源降级链
1. 腾讯 mkline API(直接返回15分钟线) 1. 新浪财经15分钟K线API(有真实amount,800条/次)
2. 腾讯 minute/query + 聚合为15分钟(仅当天数据) 2. 腾讯 minute/query + 聚合为15分钟(仅当天数据,amount为估算)
功能: 功能
- 支持HS300 / 全市场下载 - 支持HS300 / 全市场下载
- 增量下载(追加新数据,不覆盖) - 增量下载(追加新数据,不覆盖已有)
- 断点续传(JSON进度文件) - 断点续传(JSON进度文件)
- 限频保护(0.3s间隔 + 重试) - 限频保护(0.3s间隔 + 重试)
- 与已有84只Parquet格式完全一致 - 与已有84只Parquet格式完全一致(7列,end-of-bar时间戳)
- 数据校验(价格>0, OHLC一致性)
用法: 用法
python3 download_minute.py --scope hs300 python3 download_minute.py --scope hs300
python3 download_minute.py --scope all python3 download_minute.py --scope all
python3 download_minute.py --codes 000001 600519 python3 download_minute.py --codes 000001 600519
python3 download_minute.py --scope hs300 --resume # 断点续传 python3 download_minute.py --scope hs300 --resume
""" """
import argparse import argparse
@@ -33,7 +34,6 @@ from pathlib import Path
from typing import Optional, List, Tuple from typing import Optional, List, Tuple
import pandas as pd import pandas as pd
import numpy as np
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
@@ -44,90 +44,72 @@ logger = logging.getLogger(__name__)
# --- Configuration ---
# Base directory for the 15-minute K-line output.
OUTPUT_DIR = Path("/Volumes/stock/minute_kline/15min")
# JSON progress file kept next to the output.
PROGRESS_FILE = OUTPUT_DIR / "download_progress.json"
REQUEST_INTERVAL = 0.3  # seconds per request
MAX_RETRIES = 3  # retry attempts per stock
CONSECUTIVE_FAIL_PAUSE = 60  # seconds to pause after repeated failures
MAX_CONSECUTIVE_FAILS = 5  # consecutive-failure threshold that triggers the pause
# CSV inputs that supply the stock codes for --scope hs300 / --scope all.
HS300_FILE = Path("/Users/chufeng/.openclaw/sanguo_projects/sanguo_quant_live/zhaoyun-data/data/raw/stock_info/hs300_constituents_latest.csv")
ALL_STOCKS_FILE = Path("/Users/chufeng/.openclaw/sanguo_projects/sanguo_quant_live/zhaoyun-data/data/raw/stock_info/stock_basic_info_raw_20260326_113530.csv")
# Request headers sent with every HTTP call.
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"}
# --- HTTP 工具 --- def _make_opener():
def fetch_url(url: str, timeout: int = 10) -> str: """创建无代理opener,避免akshare代理污染"""
req = urllib.request.Request(url, headers=HEADERS) return urllib.request.build_opener(urllib.request.ProxyHandler({}))
with urllib.request.urlopen(req, timeout=timeout) as resp:
return resp.read().decode("utf-8", errors="replace")
# --- Sina 15-minute K-line API (primary source) ---
def try_sina_15min(symbol: str, datalen: int = 800) -> Optional[pd.DataFrame]:
    """Fetch 15-minute K-lines for one symbol from the Sina Finance API.

    Args:
        symbol: Exchange-prefixed code, e.g. "sz000001" or "sh600519".
        datalen: Number of bars to request (the original note puts the
            maximum at roughly 800).

    Returns:
        A DataFrame with the columns day, open, high, low, close, volume,
        amount (in that order), or None when the request fails, the payload
        is empty or cannot be parsed, or an expected column is missing.
    """
    url = (
        f"https://quotes.sina.cn/cn/api/jsonp_v2.php/var%20=min15_{symbol}=/"
        f"CN_MarketDataService.getKLineData?symbol={symbol}&scale=15&ma=no&datalen={datalen}"
    )
    cols = ["day", "open", "high", "low", "close", "volume", "amount"]
    try:
        req = urllib.request.Request(url, headers=HEADERS)
        with _make_opener().open(req, timeout=15) as resp:
            raw = resp.read().decode("utf-8", errors="replace")

        # The body is JSONP: extract the JSON array wrapped in the call parens.
        match = re.search(r'\((\[.*\])\)', raw, re.DOTALL)
        if not match:
            return None
        data = json.loads(match.group(1))
        if not data:
            return None

        df = pd.DataFrame(data)
        # Reject the payload unless every expected column is present.
        if any(c not in df.columns for c in cols):
            return None
        return df[cols]
    except Exception as e:
        # Deliberately best-effort: any failure returns None so the caller
        # can fall back to the secondary source.
        logger.debug("新浪15min失败 %s: %s", symbol, e)
        return None
# --- 腾讯 minute/query API + 聚合 --- # --- 腾讯 minute/query + 聚合(备源,仅当天) ---
def try_minute_query_aggregate(symbol: str, date: str) -> Optional[pd.DataFrame]: def try_minute_query_aggregate(symbol: str, date: str) -> Optional[pd.DataFrame]:
""" """
腾讯minute/query API,返回1分钟线,聚合为15分钟线 腾讯minute/query API返回1分钟线聚合为15分钟线
symbol: sz000001
Args: date: 20260502
symbol: 如 "sz000001"
date: 如 "20260502"
""" """
url = f"http://web.ifzq.gtimg.cn/appstock/app/minute/query?code={symbol}" url = f"http://web.ifzq.gtimg.cn/appstock/app/minute/query?code={symbol}"
try: try:
raw = fetch_url(url) opener = _make_opener()
data = json.loads(raw) req = urllib.request.Request(url, headers=HEADERS)
with opener.open(req, timeout=10) as r:
data = json.loads(r.read())
minute_data = data.get("data", {}).get(symbol, {}).get("data", {}).get("data", []) minute_data = data.get("data", {}).get(symbol, {}).get("data", {}).get("data", [])
if not minute_data: if not minute_data:
return None return None
# 解析1分钟线: "HHMM price vol amount"
one_min = [] one_min = []
for line in minute_data: for line in minute_data:
parts = line.split() parts = line.split()
@@ -139,22 +121,18 @@ def try_minute_query_aggregate(symbol: str, date: str) -> Optional[pd.DataFrame]
"vol": float(parts[2]), "vol": float(parts[2]),
"amount": float(parts[3]), "amount": float(parts[3]),
}) })
if not one_min: if not one_min:
return None return None
return _aggregate_1m_to_15m(pd.DataFrame(one_min))
df = pd.DataFrame(one_min)
return _aggregate_1m_to_15m(df)
except Exception as e: except Exception as e:
logger.debug("minute_query failed for %s: %s", symbol, e) logger.debug("minute_query失败 %s: %s", symbol, e)
return None return None
def _aggregate_1m_to_15m(df: pd.DataFrame) -> pd.DataFrame: def _aggregate_1m_to_15m(df: pd.DataFrame) -> pd.DataFrame:
"""1分钟线聚合为15分钟线""" """1分钟线聚合为15分钟线end-of-bar时间戳)"""
df["time"] = pd.to_datetime(df["time"]) df["time"] = pd.to_datetime(df["time"])
# 15分钟分组:按时间段切分(9:30-9:45, 9:45-10:00, ...) # end-of-bar对齐:已有84只数据用K线结束时间(09:45, 10:00...
# end-of-bar对齐:已有84只数据用K线结束时间(09:45, 10:00...)
df["group"] = df["time"].dt.floor("15min") + pd.Timedelta(minutes=15) df["group"] = df["time"].dt.floor("15min") + pd.Timedelta(minutes=15)
agg = df.groupby("group").agg( agg = df.groupby("group").agg(
@@ -163,10 +141,10 @@ def _aggregate_1m_to_15m(df: pd.DataFrame) -> pd.DataFrame:
low=("price", "min"), low=("price", "min"),
close=("price", "last"), close=("price", "last"),
volume=("vol", "sum"), volume=("vol", "sum"),
amount=("amount", "last"), # 累计值取最后 amount=("amount", "last"),
).reset_index() ).reset_index()
result = pd.DataFrame({ return pd.DataFrame({
"day": agg["group"].dt.strftime("%Y-%m-%d %H:%M:%S"), "day": agg["group"].dt.strftime("%Y-%m-%d %H:%M:%S"),
"open": agg["open"], "open": agg["open"],
"high": agg["high"], "high": agg["high"],
@@ -175,11 +153,10 @@ def _aggregate_1m_to_15m(df: pd.DataFrame) -> pd.DataFrame:
"volume": agg["volume"].astype(str), "volume": agg["volume"].astype(str),
"amount": agg["amount"].astype(str), "amount": agg["amount"].astype(str),
}) })
return result
# --- 下载主流程 --- # --- 下载主流程 ---
def get_market_prefix(code: str) -> str: def get_market_prefix(code: str) -> Tuple[str, str]:
code = re.sub(r"[^0-9]", "", code).zfill(6) code = re.sub(r"[^0-9]", "", code).zfill(6)
if code.startswith(("60", "68", "51")): if code.startswith(("60", "68", "51")):
return "sh", code return "sh", code
@@ -187,16 +164,16 @@ def get_market_prefix(code: str) -> str:
def download_single(code: str) -> Tuple[Optional[pd.DataFrame], str]: def download_single(code: str) -> Tuple[Optional[pd.DataFrame], str]:
"""下载单只股票15分钟线,返回(df, source)""" """下载单只股票15分钟线返回(df, source)"""
prefix, clean = get_market_prefix(code) prefix, clean = get_market_prefix(code)
symbol = f"{prefix}{clean}" symbol = f"{prefix}{clean}"
# 主源:mkline # 主源:新浪15分钟线
df = try_mkline(symbol) df = try_sina_15min(symbol)
if df is not None and len(df) > 0: if df is not None and len(df) > 0:
return df, "mkline" return df, "sina_15min"
# 备源:minute/query + 聚合 # 备源minute/query + 聚合
today = datetime.now().strftime("%Y%m%d") today = datetime.now().strftime("%Y%m%d")
df = try_minute_query_aggregate(symbol, today) df = try_minute_query_aggregate(symbol, today)
if df is not None and len(df) > 0: if df is not None and len(df) > 0:
@@ -206,7 +183,7 @@ def download_single(code: str) -> Tuple[Optional[pd.DataFrame], str]:
def download_with_increment(code: str, output_dir: Path) -> Tuple[str, int]: def download_with_increment(code: str, output_dir: Path) -> Tuple[str, int]:
"""增量下载单只股票,返回(status, rows)""" """增量下载单只股票"""
prefix, clean = get_market_prefix(code) prefix, clean = get_market_prefix(code)
filename = f"{prefix}{clean}_15min.parquet" filename = f"{prefix}{clean}_15min.parquet"
parquet_path = output_dir / filename parquet_path = output_dir / filename
@@ -216,23 +193,30 @@ def download_with_increment(code: str, output_dir: Path) -> Tuple[str, int]:
return "failed", 0 return "failed", 0
# 数据校验 # 数据校验
for col in ['open', 'high', 'low', 'close']: for col in ["open", "high", "low", "close"]:
df_new[col] = pd.to_numeric(df_new[col], errors='coerce') df_new[col] = pd.to_numeric(df_new[col], errors="coerce")
df_new["volume"] = pd.to_numeric(df_new["volume"], errors="coerce").fillna(0)
df_new["amount"] = pd.to_numeric(df_new["amount"], errors="coerce").fillna(0)
# 价格>0 # 价格>0
bad_zero = (df_new[['close', 'open']] <= 0).any(axis=1) bad_zero = (df_new[["close", "open"]] <= 0).any(axis=1)
if bad_zero.any(): if bad_zero.any():
logger.warning("价格<=0 %s: %d", code, bad_zero.sum()) logger.warning("价格<=0 %s: %d", code, bad_zero.sum())
df_new = df_new[~bad_zero] df_new = df_new[~bad_zero]
# OHLC一致性 # OHLC一致性
bad_ohlc = (df_new['high'] < df_new[['open', 'close']].max(axis=1)) | (df_new['low'] > df_new[['open', 'close']].min(axis=1)) bad_ohlc = (df_new["high"] < df_new[["open", "close"]].max(axis=1)) | \
(df_new["low"] > df_new[["open", "close"]].min(axis=1))
if bad_ohlc.any(): if bad_ohlc.any():
logger.warning("OHLC异常 %s: %d", code, bad_ohlc.sum()) logger.warning("OHLC异常 %s: %d", code, bad_ohlc.sum())
df_new = df_new[~bad_ohlc] df_new = df_new[~bad_ohlc]
if df_new.empty: if df_new.empty:
return "failed", 0 return "failed", 0
# 转回object类型与已有数据兼容
df_new["volume"] = df_new["volume"].astype(str)
df_new["amount"] = df_new["amount"].astype(str)
if parquet_path.exists(): if parquet_path.exists():
# 增量:合并去重
existing = pd.read_parquet(parquet_path) existing = pd.read_parquet(parquet_path)
combined = pd.concat([existing, df_new], ignore_index=True) combined = pd.concat([existing, df_new], ignore_index=True)
combined = combined.drop_duplicates(subset=["day"], keep="last") combined = combined.drop_duplicates(subset=["day"], keep="last")
@@ -264,19 +248,16 @@ def save_progress(progress: dict):
def get_stock_list(scope: str) -> List[str]:
    """Return the zero-padded six-character stock codes for a download scope.

    Args:
        scope: "hs300" reads ``HS300_FILE``; "all" reads ``ALL_STOCKS_FILE``.

    Returns:
        Codes from the first recognised code column, left-padded with zeros
        to six characters. Rows with a missing code are skipped.

    Raises:
        ValueError: If ``scope`` is unknown, or the CSV has none of the
            expected code columns.
    """
    if scope == "hs300":
        path = HS300_FILE
        candidates = ("成分券代码", "代码", "code")
        missing_msg = "HS300文件中找不到代码列现有列"
    elif scope == "all":
        path = ALL_STOCKS_FILE
        candidates = ("代码", "code", "股票代码")
        missing_msg = "全市场文件中找不到代码列现有列"
    else:
        raise ValueError(f"Unknown scope: {scope}")

    # Read everything as text: with dtype inference a code column containing
    # a blank becomes float, and str(1.0).zfill(6) would give "0001.0".
    df = pd.read_csv(path, dtype=str)
    for col in candidates:
        if col in df.columns:
            return [c.strip().zfill(6) for c in df[col].dropna()]
    raise ValueError(f"{missing_msg}: {list(df.columns)}")
@@ -292,7 +273,6 @@ def main():
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
# 获取股票列表
if args.codes: if args.codes:
codes = args.codes codes = args.codes
elif args.scope: elif args.scope:
@@ -300,10 +280,8 @@ def main():
else: else:
parser.error("必须指定 --scope 或 --codes") parser.error("必须指定 --scope 或 --codes")
# 断点续传
progress = load_progress() if args.resume else {"completed": [], "failed": []} progress = load_progress() if args.resume else {"completed": [], "failed": []}
skip_set = set(progress["completed"]) skip_set = set(progress["completed"])
todo = [c for c in codes if c not in skip_set] todo = [c for c in codes if c not in skip_set]
logger.info("股票总数: %d, 已完成: %d, 待下载: %d", len(codes), len(skip_set), len(todo)) logger.info("股票总数: %d, 已完成: %d, 待下载: %d", len(codes), len(skip_set), len(todo))
@@ -313,11 +291,9 @@ def main():
consecutive_fails = 0 consecutive_fails = 0
for i, code in enumerate(todo): for i, code in enumerate(todo):
# 限频
if i > 0: if i > 0:
time.sleep(REQUEST_INTERVAL) time.sleep(REQUEST_INTERVAL)
# 重试逻辑
status = "failed" status = "failed"
rows = 0 rows = 0
for attempt in range(MAX_RETRIES): for attempt in range(MAX_RETRIES):
@@ -339,24 +315,18 @@ def main():
consecutive_fails += 1 consecutive_fails += 1
progress["failed"].append(code) progress["failed"].append(code)
logger.warning("[%d/%d] %s: FAILED", i + 1, len(todo), code) logger.warning("[%d/%d] %s: FAILED", i + 1, len(todo), code)
# 连续失败保护
if consecutive_fails >= MAX_CONSECUTIVE_FAILS: if consecutive_fails >= MAX_CONSECUTIVE_FAILS:
logger.error("连续失败 %d,暂停 %d", consecutive_fails, CONSECUTIVE_FAIL_PAUSE) logger.error("连续失败 %d暂停 %d", consecutive_fails, CONSECUTIVE_FAIL_PAUSE)
time.sleep(CONSECUTIVE_FAIL_PAUSE) time.sleep(CONSECUTIVE_FAIL_PAUSE)
consecutive_fails =0 consecutive_fails = 0
# 定期保存进度
if (i + 1) % 50 == 0: if (i + 1) % 50 == 0:
save_progress(progress) save_progress(progress)
# 最终保存
save_progress(progress) save_progress(progress)
elapsed = time.time() - t_start elapsed = time.time() - t_start
logger.info("=" * 50) logger.info("=" * 50)
logger.info("下载完成: 成功 %d, 失败 %d, 耗时 %.1f", ok_count, fail_count, elapsed) logger.info("下载完成: 成功 %d, 失败 %d, 耗时 %.1f", ok_count, fail_count, elapsed)
logger.info("进度文件: %s", PROGRESS_FILE)
if __name__ == "__main__": if __name__ == "__main__":