#!/usr/bin/env python3
"""
Crawl every public repository of the vnpy GitHub account.

For each repository the crawler stores its metadata, its README and
(optionally) a git clone of its source tree in the knowledge base:
/Users/chufeng/.openclaw/knowledge_base/vnpy-github/
"""

import base64
import json
import logging
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import requests

# Default knowledge-base directory; also the fixed location of the log file.
DEFAULT_OUTPUT_DIR = "/Users/chufeng/.openclaw/knowledge_base/vnpy-github"
LOG_FILE = f"{DEFAULT_OUTPUT_DIR}/crawler.log"


def _configure_logging() -> None:
    """Configure root logging to the console and, when possible, to LOG_FILE.

    The log directory is created first, because ``logging.FileHandler`` opens
    its file immediately and fails if the directory is missing.  When the
    file cannot be created, logging falls back to console-only output
    instead of making the module unimportable.
    """
    handlers: List[logging.Handler] = [logging.StreamHandler()]
    file_error: Optional[OSError] = None
    try:
        Path(DEFAULT_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
        handlers.append(logging.FileHandler(LOG_FILE, encoding='utf-8'))
    except OSError as exc:
        file_error = exc

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=handlers,
    )
    if file_error is not None:
        logging.getLogger(__name__).warning(
            f"无法创建日志文件,仅输出到控制台: {file_error}"
        )


# Configure logging.
_configure_logging()
logger = logging.getLogger(__name__)


class VNPYGitHubCrawler:
    """Crawler that mirrors the public repositories of one GitHub account."""

    def __init__(
        self,
        org_name: str = "vnpy",
        output_dir: str = DEFAULT_OUTPUT_DIR,
        token: Optional[str] = None,
        timeout: float = 30.0,
    ):
        """Create the output directory layout and prepare the request headers.

        Args:
            org_name: GitHub account whose repositories are crawled.
            output_dir: Directory that receives ``repos/``, ``readmes/``,
                ``sources/`` and the summary files.
            token: Optional GitHub API token, sent as an ``Authorization``
                header when given.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.org_name = org_name
        self.output_dir = Path(output_dir)
        self.token = token
        self.timeout = timeout

        # Create the output directories.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for sub_dir in ("repos", "readmes", "sources"):
            (self.output_dir / sub_dir).mkdir(exist_ok=True)

        # Request headers.
        self.headers = {}
        if self.token:
            self.headers['Authorization'] = f'token {self.token}'
        self.headers['Accept'] = 'application/vnd.github.v3+json'

        logger.info("初始化vnpy GitHub爬取器")
        logger.info(f"组织: {self.org_name}")
        logger.info(f"输出目录: {self.output_dir}")

    @staticmethod
    def _summarize_repo(repo: Dict) -> Dict:
        """Reduce one GitHub API repository object to the fields we store."""
        return {
            "id": repo["id"],
            "name": repo["name"],
            "full_name": repo["full_name"],
            # The API sends ``null`` for a missing description, so a plain
            # ``.get(key, "")`` default would still yield None.
            "description": repo.get("description") or "",
            "url": repo["html_url"],
            "clone_url": repo["clone_url"],
            "default_branch": repo["default_branch"],
            "stars": repo["stargazers_count"],
            "forks": repo["forks_count"],
            "created_at": repo["created_at"],
            "updated_at": repo["updated_at"],
            "size": repo["size"],
        }

    def get_all_repos(self) -> List[Dict]:
        """Fetch the metadata of all public repositories, page by page.

        Returns:
            One summary dict per repository.  On an HTTP error or an
            exception the loop stops and whatever was collected so far is
            returned (possibly an empty list).
        """
        logger.info(f"开始获取 {self.org_name} 用户下的所有公开仓库...")

        # NOTE(review): this is the "list repositories for a user" endpoint;
        # confirm that ``type=public`` is an accepted value for it.
        url = f"https://api.github.com/users/{self.org_name}/repos"
        all_repos: List[Dict] = []
        page = 1

        while True:
            params = {
                "page": page,
                "per_page": 100,
                "type": "public",
            }
            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    params=params,
                    timeout=self.timeout,
                )
                if response.status_code != 200:
                    logger.error(f"获取仓库列表失败: HTTP {response.status_code}")
                    logger.error(f"响应: {response.text[:200]}")
                    break

                repos = response.json()
                if not repos:
                    break  # No more repositories.

                logger.info(f"第 {page} 页获取到 {len(repos)} 个仓库")
                for repo in repos:
                    all_repos.append(self._summarize_repo(repo))

                page += 1
                time.sleep(1)  # Avoid hammering the API.
            except Exception as e:
                # Deliberate best-effort boundary: keep the partial result.
                logger.error(f"获取仓库列表异常: {e}")
                break

        logger.info(f"总共获取到 {len(all_repos)} 个公开仓库")
        return all_repos

    def save_repo_list(self, repos: List[Dict]) -> None:
        """Write the repository list to ``vnpy_repos_list.json`` and print it.

        Args:
            repos: Repository summary dicts as returned by ``get_all_repos``.
        """
        output_file = self.output_dir / "vnpy_repos_list.json"
        summary = {
            "crawled_at": datetime.now().isoformat(),
            "org_name": self.org_name,
            "total_repos": len(repos),
            "repositories": repos,
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)
        logger.info(f"仓库列表已保存: {output_file}")

        # Print the repository overview.
        print("\n" + "="*70)
        print(f"📋 vnpy组织公开仓库列表 ({len(repos)} 个):")
        print("="*70)
        for i, repo in enumerate(repos, 1):
            stars = repo.get('stars', 0)
            # ``or ''`` guards against an explicit None description.
            description = (repo.get('description') or '')[:50]
            print(f" {i:3d}. {repo['name']:<30} ⭐{stars:4d} {description}")
        print("="*70 + "\n")

    def get_readme_content(self, repo_full_name: str) -> Optional[str]:
        """Download and decode the README of one repository.

        Args:
            repo_full_name: Repository in ``owner/name`` form.

        Returns:
            The README text, or None when the request fails, the status is
            not 200, or the payload cannot be decoded.
        """
        logger.info(f"获取 {repo_full_name} 的README...")

        url = f"https://api.github.com/repos/{repo_full_name}/readme"
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            if response.status_code != 200:
                logger.warning(f"获取README失败: HTTP {response.status_code}")
                return None

            data = response.json()
            # The README body arrives base64-encoded.
            content = base64.b64decode(data['content']).decode('utf-8')
            logger.info(f"成功获取README: {len(content)} 字符")
            return content
        except Exception as e:
            # Deliberate best-effort boundary: a missing README is not fatal.
            logger.error(f"获取README异常: {e}")
            return None

    def clone_repo_source(self, clone_url: str, repo_name: str) -> bool:
        """Clone a repository into ``sources/<repo_name>``.

        Args:
            clone_url: URL handed to ``git clone``.
            repo_name: Name of the target sub-directory.

        Returns:
            True when the clone succeeded or the target directory already
            exists, False when git failed or could not be started.
        """
        logger.info(f"克隆源码: {repo_name}")

        target_dir = self.output_dir / "sources" / repo_name
        if target_dir.exists():
            logger.info(f"仓库已存在,跳过克隆: {target_dir}")
            return True

        # An argument list (no shell) keeps paths with spaces intact and
        # rules out shell injection; ``--`` ends git's option parsing.
        cmd = ["git", "clone", "--", clone_url, str(target_dir)]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                errors="replace",
            )
        except (OSError, subprocess.SubprocessError) as e:
            # E.g. git is not installed or is not executable.
            logger.error(f"克隆异常: {e}")
            return False

        if result.returncode == 0:
            logger.info(f"✅ 克隆成功: {repo_name}")
            return True

        logger.error(f"❌ 克隆失败: {repo_name}")
        logger.error(f"错误: {result.stderr}")
        return False

    def save_readme(self, repo_name: str, content: str) -> None:
        """Write README text to ``readmes/<repo_name>_README.md``."""
        readme_file = self.output_dir / "readmes" / f"{repo_name}_README.md"
        with open(readme_file, 'w', encoding='utf-8') as f:
            f.write(content)
        logger.info(f"README已保存: {readme_file}")

    def save_repo_info(self, repo: Dict) -> None:
        """Write one repository summary to ``repos/<name>_info.json``."""
        repo_info_file = self.output_dir / "repos" / f"{repo['name']}_info.json"
        with open(repo_info_file, 'w', encoding='utf-8') as f:
            json.dump(repo, f, ensure_ascii=False, indent=2)

    def crawl_all_repos(self, clone_source: bool = True) -> Dict:
        """Crawl every repository: metadata, README and optionally source.

        Args:
            clone_source: When True, also ``git clone`` each repository.

        Returns:
            The crawl summary that is also written to ``crawl_summary.json``.
            When no repository could be listed, a dict with
            ``status == "failed"`` is returned and nothing is written.
            A repository counts as successful when its README was saved.
        """
        logger.info("="*70)
        logger.info("🚀 开始爬取vnpy所有公开仓库")
        logger.info("="*70)

        # 1. Fetch all repositories.
        repos = self.get_all_repos()
        if not repos:
            logger.error("❌ 未获取到任何仓库")
            return {
                "status": "failed",
                "error": "未获取到任何仓库",
                "total_repos": 0,
            }

        # 2. Save the repository list.
        self.save_repo_list(repos)

        # 3. Crawl the repositories one by one.
        crawl_results = {
            "total": len(repos),
            "success": 0,
            "failed": 0,
            "repos": [],
        }

        for i, repo in enumerate(repos, 1):
            repo_name = repo["name"]
            full_name = repo["full_name"]

            logger.info(f"\n{'='*60}")
            logger.info(f"📦 [{i}/{len(repos)}] 处理: {repo_name}")
            logger.info(f"{'='*60}")

            # Save the repository metadata.
            self.save_repo_info(repo)

            # Fetch the README.
            readme_content = self.get_readme_content(full_name)
            readme_saved = bool(readme_content)
            if readme_saved:
                self.save_readme(repo_name, readme_content)
                crawl_results["success"] += 1
            else:
                crawl_results["failed"] += 1

            # Clone the source; None means cloning was not requested.
            source_cloned: Optional[bool] = None
            if clone_source:
                source_cloned = self.clone_repo_source(repo["clone_url"], repo_name)
                if source_cloned:
                    logger.info(f"✅ {repo_name} 源码克隆完成")
                else:
                    logger.warning(f"⚠️ {repo_name} 源码克隆失败")

            # Record the per-repository outcome for the summary file.
            crawl_results["repos"].append({
                "name": repo_name,
                "full_name": full_name,
                "readme_saved": readme_saved,
                "source_cloned": source_cloned,
            })

            # Rate limiting.
            time.sleep(2)

        # Save the crawl summary.
        crawl_summary = {
            "crawled_at": datetime.now().isoformat(),
            "organization": self.org_name,
            "output_dir": str(self.output_dir),
            "total_repos": crawl_results["total"],
            "successfully_crawled": crawl_results["success"],
            "failed_crawled": crawl_results["failed"],
            "repositories": crawl_results["repos"],
        }

        summary_file = self.output_dir / "crawl_summary.json"
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(crawl_summary, f, ensure_ascii=False, indent=2)

        logger.info("\n" + "="*70)
        logger.info("✅ 爬取完成!")
        logger.info(f" 总计仓库: {crawl_results['total']}")
        logger.info(f" 成功爬取: {crawl_results['success']}")
        logger.info(f" 爬取失败: {crawl_results['failed']}")
        logger.info(f" 保存位置: {self.output_dir}")
        logger.info("="*70)

        return crawl_summary

    @staticmethod
    def _table_cell(text: str, limit: int = 50) -> str:
        """Make text safe for one Markdown table cell and cap its length."""
        # Collapse newlines/runs of whitespace, which would break the row.
        flat = " ".join(text.split())
        if len(flat) > limit:
            flat = flat[:limit - 3] + "..."
        # A bare pipe would start a new column.
        return flat.replace("|", "\\|")

    def generate_index(self) -> None:
        """Generate ``README.md``, a Markdown index of the crawled repos.

        Reads ``vnpy_repos_list.json`` from the output directory.

        Raises:
            FileNotFoundError: If the repository list has not been saved yet.
        """
        logger.info("生成索引文件...")

        # Read the repository list.
        repos_file = self.output_dir / "vnpy_repos_list.json"
        with open(repos_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        repos = data["repositories"]

        # Build the Markdown index.
        generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        lines = [
            "# vnpy GitHub 官方仓库索引",
            "",
            f"**爬取时间**: {generated_at}",
            f"**总计公开仓库**: {len(repos)}",
            "",
            "## 📦 仓库列表",
            "",
            "| # | 仓库名称 | ⭐ Stars | 描述 | 链接 |",
            "|---|----------|-------|------|------|",
        ]

        for i, repo in enumerate(repos, 1):
            name = repo["name"]
            stars = repo.get("stars", 0)
            description = self._table_cell(repo.get("description", "") or "")
            url = repo.get("url", "")
            lines.append(
                f"| {i} | **{name}** | {stars} | {description} | [🔗 link]({url}) |"
            )

        # Plain (non-f) strings: ``{repo_name}`` below is literal text.
        lines += [
            "",
            "## 📁 目录结构",
            "",
            "```",
            "vnpy-github/",
            "├── vnpy_repos_list.json   # 完整仓库列表JSON",
            "├── crawl_summary.json     # 爬取总结",
            "├── repos/                 # 单个仓库信息JSON",
            "├── readmes/               # README内容",
            "└── sources/               # 完整源码(git clone)",
            "```",
            "",
            "## 💡 使用说明",
            "",
            "1. **查看仓库列表**: `vnpy_repos_list.json`",
            "2. **查看README**: `readmes/{repo_name}_README.md`",
            "3. **获取源码**: `sources/{repo_name}/`",
            "",
            "**爬取完成,可供查阅分析!**",
            "",
        ]
        index_content = "\n".join(lines)

        index_file = self.output_dir / "README.md"
        with open(index_file, 'w', encoding='utf-8') as f:
            f.write(index_content)
        logger.info(f"索引文件已生成: {index_file}")


def main():
    """Parse the command line, run the crawl and build the index.

    Returns:
        The dict returned by ``VNPYGitHubCrawler.crawl_all_repos``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='vnpy GitHub组织仓库爬取')
    parser.add_argument('--org', default='vnpy', help='GitHub组织名称')
    parser.add_argument('--output', default=DEFAULT_OUTPUT_DIR, help='输出目录')
    parser.add_argument('--token', default=None, help='GitHub API token (可选,提高速率限制)')
    parser.add_argument('--no-clone', action='store_true', help='不克隆源码')
    args = parser.parse_args()

    crawler = VNPYGitHubCrawler(
        org_name=args.org,
        output_dir=args.output,
        token=args.token,
    )

    # Start crawling.
    result = crawler.crawl_all_repos(clone_source=not args.no_clone)

    # Build the index only after a successful crawl: a failed crawl never
    # writes the repository list that generate_index() reads.
    if result.get("status") != "failed":
        crawler.generate_index()

    return result


if __name__ == "__main__":
    result = main()
    if result.get("status") == "failed":
        sys.exit(1)

    print("\n🎉 vnpy GitHub爬取完成!")
    print(f" 总计: {result['total_repos']} 个仓库")
    print(f" 成功: {result['successfully_crawled']} 个")
    print(f" 失败: {result['failed_crawled']} 个")
    # Report the directory actually used, which may differ from the default.
    print(f" 保存位置: {result.get('output_dir', DEFAULT_OUTPUT_DIR)}")
    sys.exit(0)