Files
sanguo_quant_live/zhaoyun-data/scripts/github_crawler/vnpy_github_crawler.py
T
2026-03-28 00:14:34 +08:00

395 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
GitHub vnpy组织所有公开仓库爬取
爬取所有vnpy组织下的公开仓库内容
保存到知识库: /Users/chufeng/.openclaw/knowledge_base/vnpy-github/
"""
import base64
import json
import logging
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import requests
# Configure logging: always log to the console, and additionally to a log
# file inside the knowledge-base directory when that location is usable.
_LOG_FILE = Path('/Users/chufeng/.openclaw/knowledge_base/vnpy-github/crawler.log')
_log_handlers = [logging.StreamHandler()]
_log_file_error = None
try:
    # FileHandler opens the file immediately, so its parent directory must
    # exist before the handler is created (it does not on a first run).
    _LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    _log_handlers.append(logging.FileHandler(str(_LOG_FILE), encoding='utf-8'))
except OSError as exc:
    # Best effort: an unwritable log location must not prevent the module
    # from being imported; keep console logging only.
    _log_file_error = exc
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=_log_handlers
)
logger = logging.getLogger(__name__)
if _log_file_error is not None:
    logger.warning(
        "Log file %s is unavailable, logging to console only: %s",
        _LOG_FILE,
        _log_file_error,
    )
class VNPYGitHubCrawler:
    """Crawler for the public GitHub repositories of one account (default: vnpy).

    For every repository returned by the GitHub REST API the crawler stores
    the repository metadata as JSON, the decoded README, and optionally a
    ``git clone`` of the source tree, all below ``output_dir``.
    """

    # Seconds to wait for a single GitHub API response. Without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(
        self,
        org_name: str = "vnpy",
        output_dir: str = "/Users/chufeng/.openclaw/knowledge_base/vnpy-github",
        token: Optional[str] = None
    ):
        """Create the output directory tree and prepare the request headers.

        Args:
            org_name: GitHub account whose repositories are crawled.
            output_dir: Directory that receives all crawled artifacts.
            token: Optional GitHub API token, sent as an ``Authorization``
                header when provided.
        """
        self.org_name = org_name
        self.output_dir = Path(output_dir)
        self.token = token
        # Create the output directory and its fixed sub-directories.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for sub_dir in ("repos", "readmes", "sources"):
            (self.output_dir / sub_dir).mkdir(exist_ok=True)
        # Request headers: the Accept header is always sent, the token only
        # when one was supplied.
        self.headers = {}
        if self.token:
            self.headers['Authorization'] = f'token {self.token}'
        self.headers['Accept'] = 'application/vnd.github.v3+json'
        logger.info("初始化vnpy GitHub爬取器")
        logger.info(f"组织: {self.org_name}")
        logger.info(f"输出目录: {self.output_dir}")

    @staticmethod
    def _table_cell(text: Optional[str], limit: int = 50) -> str:
        """Return *text* shortened to *limit* chars and safe for a Markdown table cell.

        ``None`` becomes an empty string, line breaks become spaces and ``|``
        is escaped so that a description cannot break the table layout.
        """
        cell = " ".join((text or "").splitlines())
        if len(cell) > limit:
            cell = cell[:limit - 3] + "..."
        return cell.replace("|", "\\|")

    @staticmethod
    def _clone_command(clone_url: str, target_dir) -> List[str]:
        """Build the ``git clone`` argument list for ``subprocess.run``.

        An argument list (instead of a shell string) keeps URLs and paths
        containing spaces or shell metacharacters intact; ``--`` stops git
        from interpreting a value that starts with ``-`` as an option.
        """
        return ["git", "clone", "--", clone_url, str(target_dir)]

    def get_all_repos(self) -> List[Dict]:
        """Fetch all repositories listed for ``org_name``, following pagination.

        Returns:
            A list of dicts with a fixed subset of each repository's fields.
            On an HTTP error or exception the pages collected so far are
            returned (possibly an empty list).
        """
        logger.info(f"开始获取 {self.org_name} 用户下的所有公开仓库...")
        all_repos = []
        url = f"https://api.github.com/users/{self.org_name}/repos"
        page = 1
        while True:
            # NOTE(review): "public" is passed as the `type` filter on the
            # /users endpoint - confirm against the GitHub API documentation
            # that this value is accepted there.
            params = {
                "page": page,
                "per_page": 100,
                "type": "public"
            }
            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT,
                )
                if response.status_code != 200:
                    logger.error(f"获取仓库列表失败: HTTP {response.status_code}")
                    logger.error(f"响应: {response.text[:200]}")
                    break
                repos = response.json()
                if not repos:
                    break  # An empty page means there are no more repositories.
                logger.info(f"{page} 页获取到 {len(repos)} 个仓库")
                for repo in repos:
                    all_repos.append({
                        "id": repo["id"],
                        "name": repo["name"],
                        "full_name": repo["full_name"],
                        "description": repo.get("description", ""),
                        "url": repo["html_url"],
                        "clone_url": repo["clone_url"],
                        "default_branch": repo["default_branch"],
                        "stars": repo["stargazers_count"],
                        "forks": repo["forks_count"],
                        "created_at": repo["created_at"],
                        "updated_at": repo["updated_at"],
                        "size": repo["size"]
                    })
                page += 1
                time.sleep(1)  # Throttle requests between pages.
            except Exception as e:
                # Deliberate best effort: keep what was collected so far.
                logger.error(f"获取仓库列表异常: {e}")
                break
        logger.info(f"总共获取到 {len(all_repos)} 个公开仓库")
        return all_repos

    def save_repo_list(self, repos: List[Dict]) -> None:
        """Write the repository list to ``vnpy_repos_list.json`` and print it.

        Args:
            repos: Repository dicts as produced by ``get_all_repos``.
        """
        output_file = self.output_dir / "vnpy_repos_list.json"
        summary = {
            "crawled_at": datetime.now().isoformat(),
            "org_name": self.org_name,
            "total_repos": len(repos),
            "repositories": repos
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)
        logger.info(f"仓库列表已保存: {output_file}")
        # Print an overview of the repositories to the console.
        print("\n" + "="*70)
        print(f"📋 vnpy组织公开仓库列表 ({len(repos)} 个):")
        print("="*70)
        for i, repo in enumerate(repos, 1):
            stars = repo.get('stars', 0)
            # The description may be None (null in the API response), which
            # cannot be sliced; treat it as an empty string.
            description = (repo.get('description') or '')[:50]
            print(f" {i:3d}. {repo['name']:<30}{stars:4d} {description}")
        print("="*70 + "\n")

    def get_readme_content(self, repo_full_name: str) -> Optional[str]:
        """Download and decode the README of ``repo_full_name``.

        Returns:
            The README text, or ``None`` when the request does not return
            HTTP 200 or any exception occurs.
        """
        logger.info(f"获取 {repo_full_name} 的README...")
        url = f"https://api.github.com/repos/{repo_full_name}/readme"
        try:
            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                logger.warning(f"获取README失败: HTTP {response.status_code}")
                return None
            data = response.json()
            # The README body is delivered base64-encoded. Undecodable bytes
            # are replaced instead of discarding the whole README.
            content = base64.b64decode(data['content']).decode('utf-8', errors='replace')
            logger.info(f"成功获取README: {len(content)} 字符")
            return content
        except Exception as e:
            # Deliberate best effort: a missing README must not stop the crawl.
            logger.error(f"获取README异常: {e}")
            return None

    def clone_repo_source(self, clone_url: str, repo_name: str) -> bool:
        """Clone a repository into ``sources/<repo_name>``.

        Returns:
            ``True`` when the clone succeeded or the target directory already
            exists, ``False`` when git failed or could not be started.
        """
        logger.info(f"克隆源码: {repo_name}")
        target_dir = self.output_dir / "sources" / repo_name
        if target_dir.exists():
            logger.info(f"仓库已存在,跳过克隆: {target_dir}")
            return True
        try:
            result = subprocess.run(
                self._clone_command(clone_url, target_dir),
                capture_output=True,
                text=True,
            )
        except Exception as e:
            # E.g. git is not installed; report and continue with the crawl.
            logger.error(f"克隆异常: {e}")
            return False
        if result.returncode == 0:
            logger.info(f"✅ 克隆成功: {repo_name}")
            return True
        logger.error(f"❌ 克隆失败: {repo_name}")
        logger.error(f"错误: {result.stderr}")
        return False

    def save_readme(self, repo_name: str, content: str) -> None:
        """Write README text to ``readmes/<repo_name>_README.md``."""
        readme_file = self.output_dir / "readmes" / f"{repo_name}_README.md"
        with open(readme_file, 'w', encoding='utf-8') as f:
            f.write(content)
        logger.info(f"README已保存: {readme_file}")

    def save_repo_info(self, repo: Dict) -> None:
        """Write one repository dict to ``repos/<name>_info.json``."""
        repo_info_file = self.output_dir / "repos" / f"{repo['name']}_info.json"
        with open(repo_info_file, 'w', encoding='utf-8') as f:
            json.dump(repo, f, ensure_ascii=False, indent=2)

    def crawl_all_repos(self, clone_source: bool = True) -> Dict:
        """Crawl every repository: metadata, README and optionally the source.

        Args:
            clone_source: When true, each repository is also cloned.

        Returns:
            The crawl summary that is also written to ``crawl_summary.json``.
            When no repository could be listed, a dict with
            ``"status": "failed"`` is returned and nothing is written.
        """
        logger.info("="*70)
        logger.info("🚀 开始爬取vnpy所有公开仓库")
        logger.info("="*70)
        # 1. Fetch the repository list.
        repos = self.get_all_repos()
        if not repos:
            logger.error("❌ 未获取到任何仓库")
            return {
                "status": "failed",
                "error": "未获取到任何仓库",
                "total_repos": 0
            }
        # 2. Persist the repository list.
        self.save_repo_list(repos)
        # 3. Process the repositories one by one. "success"/"failed" count
        #    whether a README could be fetched for the repository.
        crawl_results = {
            "total": len(repos),
            "success": 0,
            "failed": 0,
            "repos": []
        }
        for i, repo in enumerate(repos, 1):
            repo_name = repo["name"]
            full_name = repo["full_name"]
            logger.info(f"\n{'='*60}")
            logger.info(f"📦 [{i}/{len(repos)}] 处理: {repo_name}")
            logger.info(f"{'='*60}")
            # Store the repository metadata.
            self.save_repo_info(repo)
            # Fetch and store the README.
            readme_content = self.get_readme_content(full_name)
            readme_saved = bool(readme_content)
            if readme_saved:
                self.save_readme(repo_name, readme_content)
                crawl_results["success"] += 1
            else:
                crawl_results["failed"] += 1
            # Clone the source tree; None records that cloning was not requested.
            source_cloned = None
            if clone_source:
                source_cloned = self.clone_repo_source(repo["clone_url"], repo_name)
                if source_cloned:
                    logger.info(f"{repo_name} 源码克隆完成")
                else:
                    logger.warning(f"⚠️ {repo_name} 源码克隆失败")
            # Record the per-repository outcome so the summary's
            # "repositories" list is actually filled.
            crawl_results["repos"].append({
                "name": repo_name,
                "full_name": full_name,
                "readme_saved": readme_saved,
                "source_cloned": source_cloned
            })
            # Rate limiting between repositories.
            time.sleep(2)
        # Persist the crawl summary.
        crawl_summary = {
            "crawled_at": datetime.now().isoformat(),
            "organization": self.org_name,
            "total_repos": crawl_results["total"],
            "successfully_crawled": crawl_results["success"],
            "failed_crawled": crawl_results["failed"],
            "repositories": crawl_results["repos"]
        }
        summary_file = self.output_dir / "crawl_summary.json"
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(crawl_summary, f, ensure_ascii=False, indent=2)
        logger.info("\n" + "="*70)
        logger.info("✅ 爬取完成!")
        logger.info(f" 总计仓库: {crawl_results['total']}")
        logger.info(f" 成功爬取: {crawl_results['success']}")
        logger.info(f" 爬取失败: {crawl_results['failed']}")
        logger.info(f" 保存位置: {self.output_dir}")
        logger.info("="*70)
        return crawl_summary

    def generate_index(self) -> None:
        """Generate ``README.md``, a Markdown index of the crawled repositories.

        Reads ``vnpy_repos_list.json`` from the output directory.

        Raises:
            FileNotFoundError: If the repository list has not been saved yet.
        """
        logger.info("生成索引文件...")
        # Load the saved repository list.
        repos_file = self.output_dir / "vnpy_repos_list.json"
        with open(repos_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        repos = data["repositories"]
        # Header of the Markdown index.
        header = """# vnpy GitHub 官方仓库索引
**爬取时间**: {crawled_at}
**总计公开仓库**: {total_repos}
## 📦 仓库列表
| # | 仓库名称 | ⭐ Stars | 描述 | 链接 |
|---|----------|-------|------|------|
""".format(
            crawled_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            total_repos=len(repos)
        )
        # One table row per repository; collected in a list and joined once.
        rows = []
        for i, repo in enumerate(repos, 1):
            name = repo["name"]
            stars = repo.get("stars", 0)
            description = self._table_cell(repo.get("description"))
            url = repo.get("url", "")
            rows.append(f"| {i} | **{name}** | {stars} | {description} | [🔗 link]({url}) |\n")
        # Static footer; it is not passed through str.format, so the
        # {repo_name} placeholders are emitted literally.
        footer = """
## 📁 目录结构
```
vnpy-github/
├── vnpy_repos_list.json # 完整仓库列表JSON
├── crawl_summary.json # 爬取总结
├── repos/ # 单个仓库信息JSON
├── readmes/ # README内容
└── sources/ # 完整源码(git clone
```
## 💡 使用说明
1. **查看仓库列表**: `vnpy_repos_list.json`
2. **查看README**: `readmes/{repo_name}_README.md`
3. **获取源码**: `sources/{repo_name}/`
**爬取完成,可供查阅分析!**
"""
        index_content = header + "".join(rows) + footer
        index_file = self.output_dir / "README.md"
        with open(index_file, 'w', encoding='utf-8') as f:
            f.write(index_content)
        logger.info(f"索引文件已生成: {index_file}")
def main():
    """Parse the command line, run the crawl and build the index.

    Returns:
        The dict returned by ``VNPYGitHubCrawler.crawl_all_repos``; it
        carries ``"status": "failed"`` when no repository could be listed.
    """
    import argparse
    parser = argparse.ArgumentParser(description='vnpy GitHub组织仓库爬取')
    parser.add_argument('--org', default='vnpy', help='GitHub组织名称')
    parser.add_argument('--output', default='/Users/chufeng/.openclaw/knowledge_base/vnpy-github', help='输出目录')
    parser.add_argument('--token', default=None, help='GitHub API token (可选,提高速率限制)')
    parser.add_argument('--no-clone', action='store_true', help='不克隆源码')
    args = parser.parse_args()
    crawler = VNPYGitHubCrawler(
        org_name=args.org,
        output_dir=args.output,
        token=args.token
    )
    # Run the crawl.
    result = crawler.crawl_all_repos(clone_source=not args.no_clone)
    # Build the index only after a successful crawl. A failed crawl returns
    # before the repository list is written, so generating the index would
    # either raise FileNotFoundError or rebuild it from a stale list.
    if result.get("status") != "failed":
        crawler.generate_index()
    return result
if __name__ == "__main__":
    result = main()
    # A failed crawl is reported to the caller through a non-zero exit status.
    if result.get("status") == "failed":
        sys.exit(1)
    # Successful crawl: print the closing report line by line, then exit cleanly.
    report_lines = (
        "\n🎉 vnpy GitHub爬取完成!",
        f" 总计: {result['total_repos']} 个仓库",
        f" 成功: {result['successfully_crawled']}",
        f" 失败: {result['failed_crawled']}",
        " 保存位置: /Users/chufeng/.openclaw/knowledge_base/vnpy-github/",
    )
    for report_line in report_lines:
        print(report_line)
    sys.exit(0)