395 lines
13 KiB
Python
395 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
GitHub vnpy组织所有公开仓库爬取
|
||
爬取所有vnpy组织下的公开仓库内容
|
||
保存到知识库: /Users/chufeng/.openclaw/knowledge_base/vnpy-github/
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import time
|
||
import logging
|
||
import requests
|
||
from datetime import datetime
|
||
from typing import Dict, List, Optional
|
||
from pathlib import Path
|
||
|
||
# Configure logging: every record goes to the console and to crawler.log.
# FileHandler opens the log file immediately, so its directory has to exist
# before basicConfig() runs.  The directory is otherwise only created later by
# VNPYGitHubCrawler.__init__, which made a first run on a clean machine fail
# right here with FileNotFoundError.
_LOG_FILE = Path('/Users/chufeng/.openclaw/knowledge_base/vnpy-github/crawler.log')
_LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(str(_LOG_FILE), encoding='utf-8'),
    ],
)
logger = logging.getLogger(__name__)
|
||
|
||
class VNPYGitHubCrawler:
    """Crawl the public GitHub repositories of one account into a local folder.

    For every repository the crawler stores its metadata as JSON, saves the
    README as Markdown and, optionally, clones the source tree with ``git``.

    Files written below ``output_dir``::

        vnpy_repos_list.json   full repository list (``save_repo_list``)
        crawl_summary.json     summary of the last crawl (``crawl_all_repos``)
        README.md              Markdown index (``generate_index``)
        repos/                 one ``<name>_info.json`` per repository
        readmes/               one ``<name>_README.md`` per repository
        sources/               one git clone per repository
    """

    # Seconds to wait for a GitHub API response before giving up; without a
    # timeout ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Page size requested from the repository listing endpoint.
    PER_PAGE = 100

    def __init__(
        self,
        org_name: str = "vnpy",
        output_dir: str = "/Users/chufeng/.openclaw/knowledge_base/vnpy-github",
        token: Optional[str] = None
    ) -> None:
        """Create the output directory tree and prepare the request headers.

        Args:
            org_name: GitHub account whose repositories are crawled.
            output_dir: Directory that receives all crawl results.
            token: Optional GitHub API token; sent as an ``Authorization``
                header when given.
        """
        self.org_name = org_name
        self.output_dir = Path(output_dir)
        self.token = token

        # Create the output directory and its three fixed sub-folders.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for sub_dir in ("repos", "readmes", "sources"):
            (self.output_dir / sub_dir).mkdir(exist_ok=True)

        # Headers sent with every API request.
        self.headers = {}
        if self.token:
            self.headers['Authorization'] = f'token {self.token}'
        self.headers['Accept'] = 'application/vnd.github.v3+json'

        logger.info("初始化vnpy GitHub爬取器")
        logger.info(f"组织: {self.org_name}")
        logger.info(f"输出目录: {self.output_dir}")

    def get_all_repos(self) -> List[Dict]:
        """Fetch the repository list of ``org_name`` page by page.

        Returns:
            One dict per repository with the keys ``id``, ``name``,
            ``full_name``, ``description``, ``url``, ``clone_url``,
            ``default_branch``, ``stars``, ``forks``, ``created_at``,
            ``updated_at`` and ``size``.  If a request fails, the
            repositories collected so far are returned (possibly none).
        """
        logger.info(f"开始获取 {self.org_name} 用户下的所有公开仓库...")

        # NOTE(review): this is the /users/ endpoint although the account is
        # described as an organisation elsewhere in this file - confirm that
        # "type": "public" is honoured here.
        url = f"https://api.github.com/users/{self.org_name}/repos"
        all_repos = []
        page = 1

        while True:
            params = {
                "page": page,
                "per_page": self.PER_PAGE,
                "type": "public"
            }

            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT,
                )

                if response.status_code != 200:
                    logger.error(f"获取仓库列表失败: HTTP {response.status_code}")
                    logger.error(f"响应: {response.text[:200]}")
                    break

                repos = response.json()
                if not repos:
                    break  # no more repositories

                logger.info(f"第 {page} 页获取到 {len(repos)} 个仓库")

                for repo in repos:
                    all_repos.append({
                        "id": repo["id"],
                        "name": repo["name"],
                        "full_name": repo["full_name"],
                        # The API sends null for a missing description;
                        # store "" so later slicing/formatting cannot fail.
                        "description": repo.get("description") or "",
                        "url": repo["html_url"],
                        "clone_url": repo["clone_url"],
                        "default_branch": repo["default_branch"],
                        "stars": repo["stargazers_count"],
                        "forks": repo["forks_count"],
                        "created_at": repo["created_at"],
                        "updated_at": repo["updated_at"],
                        "size": repo["size"]
                    })

            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                # Network error, invalid JSON or an unexpected payload shape:
                # stop paging and keep what has been collected.
                logger.error(f"获取仓库列表异常: {e}")
                break

            if len(repos) < self.PER_PAGE:
                break  # a short page is the last page; skip the empty request

            page += 1
            time.sleep(1)  # avoid hammering the API

        logger.info(f"总共获取到 {len(all_repos)} 个公开仓库")
        return all_repos

    def save_repo_list(self, repos: List[Dict]) -> None:
        """Write the repository list to ``vnpy_repos_list.json`` and print it.

        Args:
            repos: Repository dicts as returned by ``get_all_repos``.
        """
        output_file = self.output_dir / "vnpy_repos_list.json"

        summary = {
            "crawled_at": datetime.now().isoformat(),
            "org_name": self.org_name,
            "total_repos": len(repos),
            "repositories": repos
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)

        logger.info(f"仓库列表已保存: {output_file}")

        # Print a human-readable overview to the console.
        print("\n" + "="*70)
        print(f"📋 vnpy组织公开仓库列表 ({len(repos)} 个):")
        print("="*70)
        for i, repo in enumerate(repos, 1):
            stars = repo.get('stars', 0)
            # "description" may be present but None; slicing None raised
            # TypeError and aborted the whole crawl.
            description = (repo.get('description') or '')[:50]
            print(f" {i:3d}. {repo['name']:<30} ⭐{stars:4d} {description}")
        print("="*70 + "\n")

    def get_readme_content(self, repo_full_name: str) -> Optional[str]:
        """Download and decode the README of one repository.

        Args:
            repo_full_name: Repository in ``owner/name`` form.

        Returns:
            The README text, or ``None`` when the request fails, the status
            is not 200 or the payload cannot be decoded.
        """
        import base64  # only needed here

        logger.info(f"获取 {repo_full_name} 的README...")

        url = f"https://api.github.com/repos/{repo_full_name}/readme"

        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )

            if response.status_code != 200:
                logger.warning(f"获取README失败: HTTP {response.status_code}")
                return None

            data = response.json()
            # The README body is delivered base64-encoded.  Undecodable
            # bytes are replaced rather than discarding the whole README.
            content = base64.b64decode(data['content']).decode('utf-8', errors='replace')

            logger.info(f"成功获取README: {len(content)} 字符")
            return content

        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
            # ValueError also covers invalid JSON and invalid base64 data.
            logger.error(f"获取README异常: {e}")
            return None

    def clone_repo_source(self, clone_url: str, repo_name: str) -> bool:
        """Clone one repository into ``sources/<repo_name>``.

        Args:
            clone_url: URL handed to ``git clone``.
            repo_name: Name of the target directory below ``sources``.

        Returns:
            ``True`` when the clone succeeded or the target directory already
            exists, ``False`` otherwise.
        """
        # Imported locally: the module never imported subprocess, so the
        # original call raised NameError and every clone was reported as failed.
        import subprocess

        logger.info(f"克隆源码: {repo_name}")

        target_dir = self.output_dir / "sources" / repo_name

        if target_dir.exists():
            logger.info(f"仓库已存在,跳过克隆: {target_dir}")
            return True

        try:
            # Argument list instead of a shell string: safe for paths with
            # spaces and immune to shell injection through URL or name.
            result = subprocess.run(
                ["git", "clone", "--", clone_url, str(target_dir)],
                capture_output=True,
                text=True,
            )
        except (OSError, subprocess.SubprocessError) as e:
            # e.g. the git executable is not installed
            logger.error(f"克隆异常: {e}")
            return False

        if result.returncode == 0:
            logger.info(f"✅ 克隆成功: {repo_name}")
            return True

        logger.error(f"❌ 克隆失败: {repo_name}")
        logger.error(f"错误: {result.stderr}")
        return False

    def save_readme(self, repo_name: str, content: str) -> None:
        """Write ``content`` to ``readmes/<repo_name>_README.md``."""
        readme_file = self.output_dir / "readmes" / f"{repo_name}_README.md"

        with open(readme_file, 'w', encoding='utf-8') as f:
            f.write(content)

        logger.info(f"README已保存: {readme_file}")

    def save_repo_info(self, repo: Dict) -> None:
        """Write one repository dict to ``repos/<name>_info.json``."""
        repo_info_file = self.output_dir / "repos" / f"{repo['name']}_info.json"

        with open(repo_info_file, 'w', encoding='utf-8') as f:
            json.dump(repo, f, ensure_ascii=False, indent=2)

    def crawl_all_repos(self, clone_source: bool = True) -> Dict:
        """Run the full crawl: list, metadata, READMEs and optional clones.

        Args:
            clone_source: When true, also ``git clone`` every repository.

        Returns:
            On success the summary that is also written to
            ``crawl_summary.json`` (keys ``crawled_at``, ``organization``,
            ``total_repos``, ``successfully_crawled``, ``failed_crawled``,
            ``repositories``).  When no repository could be listed, a dict
            with ``status`` set to ``"failed"``, ``error`` and
            ``total_repos``.  The success/failure counters refer to README
            downloads.
        """
        logger.info("="*70)
        logger.info("🚀 开始爬取vnpy所有公开仓库")
        logger.info("="*70)

        # 1. Fetch the repository list.
        repos = self.get_all_repos()

        if not repos:
            logger.error("❌ 未获取到任何仓库")
            return {
                "status": "failed",
                "error": "未获取到任何仓库",
                "total_repos": 0
            }

        # 2. Persist the list.
        self.save_repo_list(repos)

        # 3. Process the repositories one by one.
        crawl_results = {
            "total": len(repos),
            "success": 0,
            "failed": 0,
            "repos": []
        }

        for i, repo in enumerate(repos, 1):
            repo_name = repo["name"]
            full_name = repo["full_name"]

            logger.info(f"\n{'='*60}")
            logger.info(f"📦 [{i}/{len(repos)}] 处理: {repo_name}")
            logger.info(f"{'='*60}")

            self.save_repo_info(repo)

            # README: None means the download failed; an empty README is
            # still a successful download and is saved as an empty file.
            readme_content = self.get_readme_content(full_name)
            readme_saved = readme_content is not None
            if readme_saved:
                self.save_readme(repo_name, readme_content)
                crawl_results["success"] += 1
            else:
                crawl_results["failed"] += 1

            # Source clone (None in the summary when cloning was not requested).
            source_cloned = None
            if clone_source:
                source_cloned = self.clone_repo_source(repo["clone_url"], repo_name)
                if source_cloned:
                    logger.info(f"✅ {repo_name} 源码克隆完成")
                else:
                    logger.warning(f"⚠️ {repo_name} 源码克隆失败")

            # Record the per-repository outcome; this list used to stay
            # empty, which made the "repositories" summary field useless.
            crawl_results["repos"].append({
                "name": repo_name,
                "full_name": full_name,
                "readme_saved": readme_saved,
                "source_cloned": source_cloned
            })

            # Rate limiting between repositories.
            time.sleep(2)

        # Persist the crawl summary.
        crawl_summary = {
            "crawled_at": datetime.now().isoformat(),
            "organization": self.org_name,
            "total_repos": crawl_results["total"],
            "successfully_crawled": crawl_results["success"],
            "failed_crawled": crawl_results["failed"],
            "repositories": crawl_results["repos"]
        }

        summary_file = self.output_dir / "crawl_summary.json"
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(crawl_summary, f, ensure_ascii=False, indent=2)

        logger.info("\n" + "="*70)
        logger.info("✅ 爬取完成!")
        logger.info(f" 总计仓库: {crawl_results['total']}")
        logger.info(f" 成功爬取: {crawl_results['success']}")
        logger.info(f" 爬取失败: {crawl_results['failed']}")
        logger.info(f" 保存位置: {self.output_dir}")
        logger.info("="*70)

        return crawl_summary

    def generate_index(self) -> None:
        """Build the Markdown index ``README.md`` from ``vnpy_repos_list.json``.

        Does nothing except log an error when the repository list cannot be
        read (for example because no crawl has succeeded yet).
        """
        logger.info("生成索引文件...")

        # Load the repository list written by save_repo_list().
        repos_file = self.output_dir / "vnpy_repos_list.json"
        try:
            with open(repos_file, 'r', encoding='utf-8') as f:
                repos = json.load(f)["repositories"]
        except (OSError, ValueError, KeyError) as e:
            logger.error(f"无法读取仓库列表,跳过索引生成: {e}")
            return

        lines = [
            "# vnpy GitHub 官方仓库索引",
            "",
            f"**爬取时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"**总计公开仓库**: {len(repos)}",
            "",
            "## 📦 仓库列表",
            "",
            "| # | 仓库名称 | ⭐ Stars | 描述 | 链接 |",
            "|---|----------|-------|------|------|",
        ]

        for i, repo in enumerate(repos, 1):
            name = repo["name"]
            stars = repo.get("stars", 0)
            # Collapse whitespace so a multi-line description stays in one
            # table row, shorten it, then escape "|" so it cannot be read
            # as a column separator.
            description = " ".join((repo.get("description") or "").split())
            if len(description) > 50:
                description = description[:47] + "..."
            description = description.replace("|", "\\|")
            url = repo.get("url", "")

            lines.append(f"| {i} | **{name}** | {stars} | {description} | [🔗 link]({url}) |")

        lines += [
            "",
            "",
            "## 📁 目录结构",
            "",
            "```",
            "vnpy-github/",
            "├── vnpy_repos_list.json # 完整仓库列表JSON",
            "├── crawl_summary.json # 爬取总结",
            "├── repos/ # 单个仓库信息JSON",
            "├── readmes/ # README内容",
            "└── sources/ # 完整源码(git clone)",
            "```",
            "",
            "## 💡 使用说明",
            "",
            "1. **查看仓库列表**: `vnpy_repos_list.json`",
            "2. **查看README**: `readmes/{repo_name}_README.md`",
            "3. **获取源码**: `sources/{repo_name}/`",
            "",
            "**爬取完成,可供查阅分析!**",
            "",
        ]

        index_file = self.output_dir / "README.md"
        with open(index_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))

        logger.info(f"索引文件已生成: {index_file}")
|
||
|
||
|
||
def main():
    """Parse the command line, run the crawl and build the index.

    Returns:
        The dict returned by ``VNPYGitHubCrawler.crawl_all_repos``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='vnpy GitHub组织仓库爬取')
    parser.add_argument('--org', default='vnpy', help='GitHub组织名称')
    parser.add_argument('--output', default='/Users/chufeng/.openclaw/knowledge_base/vnpy-github', help='输出目录')
    parser.add_argument('--token', default=None, help='GitHub API token (可选,提高速率限制)')
    parser.add_argument('--no-clone', action='store_true', help='不克隆源码')

    args = parser.parse_args()

    crawler = VNPYGitHubCrawler(
        org_name=args.org,
        output_dir=args.output,
        token=args.token
    )

    # Run the crawl.
    result = crawler.crawl_all_repos(clone_source=not args.no_clone)

    # Build the index only after a successful crawl.  A failed crawl returns
    # before the repository list file is written, so generating the index
    # would either crash on the missing file or describe a stale list left
    # over from an earlier run.
    if result.get("status") != "failed":
        crawler.generate_index()

    return result
|
||
|
||
|
||
if __name__ == "__main__":
    result = main()

    # A failed crawl is signalled through the exit status only.
    if result.get("status") == "failed":
        sys.exit(1)

    # Successful crawl: print a short report, then exit cleanly.
    report_lines = (
        "\n🎉 vnpy GitHub爬取完成!",
        f" 总计: {result['total_repos']} 个仓库",
        f" 成功: {result['successfully_crawled']} 个",
        f" 失败: {result['failed_crawled']} 个",
        " 保存位置: /Users/chufeng/.openclaw/knowledge_base/vnpy-github/",
    )
    for report_line in report_lines:
        print(report_line)
    sys.exit(0)
|