chore: update project structure for new workspace layout

2026-03-25 23:07:52 +08:00
parent e18d0ed3e6
commit fd21c8e1a1
21 changed files with 1641 additions and 402 deletions
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+# 批量数据下载器 - 赵云数据工程工具
+# 用于批量下载聚宽文章、金融数据等
+
+import requests
+import time
+import json
+import os
+from typing import List, Dict, Optional
+import logging
+from datetime import datetime
+
+class BatchDownloader:
+    """Batch downloader for JoinQuant articles and financial data.
+
+    Every result is written as a JSON file into ``output_dir``. A single
+    ``requests.Session`` is shared by all HTTP requests; release it with
+    :meth:`close` or by using the instance as a context manager.
+    """
+
+    def __init__(self, output_dir: str = "./data/raw"):
+        """Set up the output directory, the HTTP session and the logger.
+
+        Args:
+            output_dir: Directory that receives every written file. It is
+                created when it does not exist yet.
+        """
+        self.output_dir = output_dir
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+
+        # Create the output directory.
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Configure logging. basicConfig() does nothing when the root logger
+        # already has handlers, so a host application's setup is preserved.
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+        self.logger = logging.getLogger(__name__)
+
+    def close(self) -> None:
+        """Close the shared HTTP session."""
+        self.session.close()
+
+    def __enter__(self) -> "BatchDownloader":
+        """Return the downloader itself for use in a ``with`` statement."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close the session on exit; exceptions are not suppressed."""
+        self.close()
+
+    def _fetch_and_save_article(self, link: str, article_id: str,
+                                timeout: float = 10) -> str:
+        """Download one article, parse it and store it as JSON.
+
+        Args:
+            link: Article URL.
+            article_id: Base name (without extension) of the output file.
+            timeout: Value passed as ``timeout`` to ``session.get``.
+
+        Returns:
+            str: Path of the JSON file that was written.
+
+        Raises:
+            Exception: Whatever the HTTP request, ``raise_for_status`` or the
+                file write raises; callers decide how to record the failure.
+        """
+        response = self.session.get(link, timeout=timeout)
+        response.raise_for_status()
+
+        article_data = self._parse_jq_article(response.text)
+
+        save_path = os.path.join(self.output_dir, f"{article_id}.json")
+        with open(save_path, 'w', encoding='utf-8') as f:
+            json.dump(article_data, f, ensure_ascii=False, indent=2)
+        return save_path
+
+    def download_jq_articles(self, article_links: List[str], delay: float = 1.0,
+                             timeout: float = 10) -> Dict:
+        """Download a list of JoinQuant articles one by one.
+
+        A failure on one link is logged and recorded; it never aborts the
+        rest of the batch.
+
+        Args:
+            article_links: Article URLs. Files are named by 1-based position
+                (``article_001.json``, ...), so a later call with the same
+                ``output_dir`` overwrites files of an earlier call.
+            delay: Seconds to sleep between two requests. Values <= 0
+                disable the pause.
+            timeout: Per-request timeout passed to ``session.get``.
+
+        Returns:
+            Dict: ``total``/``success``/``failed`` counters, ``articles``
+            (one entry per saved article) and ``failures`` (one entry per
+            failed link, holding ``id``, ``url``, ``status`` and ``error``).
+        """
+        total = len(article_links)
+        results = {
+            'total': total,
+            'success': 0,
+            'failed': 0,
+            'articles': [],
+            'failures': []
+        }
+
+        for i, link in enumerate(article_links, 1):
+            article_id = f"article_{i:03d}"
+            try:
+                self.logger.info(f"下载文章 {i}/{total}: {link}")
+                save_path = self._fetch_and_save_article(link, article_id, timeout)
+            except Exception as e:
+                # Batch boundary: keep going, but remember what failed so it
+                # can be retried later instead of only living in the log.
+                self.logger.error(f"下载失败 {link}: {e}")
+                results['failed'] += 1
+                results['failures'].append({
+                    'id': article_id,
+                    'url': link,
+                    'status': 'failed',
+                    'error': str(e)
+                })
+            else:
+                results['success'] += 1
+                results['articles'].append({
+                    'id': article_id,
+                    'url': link,
+                    'save_path': save_path,
+                    'timestamp': datetime.now().isoformat()
+                })
+                self.logger.info(f"文章 {article_id} 下载成功")
+
+            # Throttle between requests, but not after the last one.
+            # time.sleep() rejects negative values, hence the guard.
+            if delay > 0 and i < total:
+                time.sleep(delay)
+
+        return results
+
+    def _parse_jq_article(self, html_content: str) -> Dict:
+        """Build the record stored for one article.
+
+        NOTE: this is a placeholder. ``html_content`` is not inspected yet;
+        the title is derived from the current time and the content is a
+        fixed "not implemented" marker.
+
+        Args:
+            html_content: Raw HTML of the article page (currently unused).
+
+        Returns:
+            Dict: ``title``, ``content`` and a ``metadata`` dict with
+            ``source``, ``crawl_time`` and ``status``.
+        """
+        # Take one timestamp so title and crawl_time cannot disagree.
+        now = datetime.now()
+        return {
+            'title': f"聚宽文章 - {now.strftime('%Y%m%d_%H%M%S')}",
+            'content': "文章内容解析逻辑待实现",
+            'metadata': {
+                'source': 'joinquant',
+                'crawl_time': now.isoformat(),
+                'status': 'raw'
+            }
+        }
+
+    def download_financial_data(self, symbols: List[str], start_date: str, end_date: str) -> Dict:
+        """Write one JSON file per symbol for the requested date range.
+
+        NOTE: no data source is integrated yet (akshare, tushare, ... could
+        be plugged in here); each file holds the request parameters and an
+        empty ``data`` list.
+
+        Args:
+            symbols: Stock codes. Each code becomes part of a file name.
+            start_date: Start date, stored and used in the file name as given.
+            end_date: End date, stored and used in the file name as given.
+
+        Returns:
+            Dict: Maps each symbol to ``{'status': 'success', 'save_path': ...}``
+            or ``{'status': 'failed', 'error': ...}``.
+        """
+        results = {}
+
+        for symbol in symbols:
+            try:
+                self.logger.info(f"下载金融数据: {symbol}")
+
+                # Placeholder payload until a real data source is wired in.
+                data = {
+                    'symbol': symbol,
+                    'start_date': start_date,
+                    'end_date': end_date,
+                    'data': []
+                }
+
+                save_path = os.path.join(self.output_dir, f"financial_{symbol}_{start_date}_{end_date}.json")
+                with open(save_path, 'w', encoding='utf-8') as f:
+                    json.dump(data, f, ensure_ascii=False, indent=2)
+
+                results[symbol] = {
+                    'status': 'success',
+                    'save_path': save_path
+                }
+
+            except Exception as e:
+                # Batch boundary: one bad symbol must not abort the others.
+                self.logger.error(f"下载金融数据失败 {symbol}: {e}")
+                results[symbol] = {
+                    'status': 'failed',
+                    'error': str(e)
+                }
+
+        return results
+
+    def resume_download(self, log_file: str) -> Dict:
+        """Retry the failed items listed in a JSON download log.
+
+        The log is read as ``{'items': [...]}``; items whose ``status`` is
+        ``'failed'`` are retried. An item counts as resumed only when its
+        download actually succeeds.
+
+        NOTE(review): nothing in this file writes such a log, so the item
+        schema is an assumption: each failed item is expected to carry a
+        ``url`` and optionally an ``id`` (used as the output file name).
+        Items without ``url`` cannot be retried and stay failed. Confirm
+        against the component that produces the log.
+
+        Args:
+            log_file: Path of the JSON download log.
+
+        Returns:
+            Dict: ``{'status': 'completed', 'failed': 0}`` when nothing
+            failed; ``{'status': 'failed', 'error': ...}`` when the log
+            cannot be read or processed; otherwise ``status``,
+            ``total_failed``, ``resumed`` and ``still_failed``.
+        """
+        self.logger.info(f"尝试断点续传: {log_file}")
+
+        try:
+            with open(log_file, 'r', encoding='utf-8') as f:
+                log_data = json.load(f)
+
+            # Collect the items that failed.
+            failed_items = [item for item in log_data.get('items', [])
+                            if item.get('status') == 'failed']
+
+            if not failed_items:
+                self.logger.info("没有失败的下载项")
+                return {'status': 'completed', 'failed': 0}
+
+            self.logger.info(f"发现 {len(failed_items)} 个失败的下载项，尝试重新下载")
+
+            # Retry each failed item; only real successes are counted.
+            success_count = 0
+            for index, item in enumerate(failed_items, 1):
+                try:
+                    link = item.get('url')
+                    if not link:
+                        raise ValueError("失败项缺少 url 字段")
+                    article_id = item.get('id') or f"retry_{index:03d}"
+                    self._fetch_and_save_article(link, article_id)
+                    success_count += 1
+                except Exception as e:
+                    self.logger.error(f"重新下载失败: {e}")
+
+            return {
+                'status': f'resumed {success_count}/{len(failed_items)}',
+                'total_failed': len(failed_items),
+                'resumed': success_count,
+                'still_failed': len(failed_items) - success_count
+            }
+
+        except Exception as e:
+            self.logger.error(f"断点续传失败: {e}")
+            return {'status': 'failed', 'error': str(e)}
+
+def main():
+    """Demonstrate the downloader: fetch two articles, then two symbols."""
+    client = BatchDownloader()
+
+    # Example: download JoinQuant articles.
+    article_summary = client.download_jq_articles([
+        "https://www.joinquant.com/view/community/detail/12345",
+        "https://www.joinquant.com/view/community/detail/67890",
+    ])
+    article_report = json.dumps(article_summary, ensure_ascii=False, indent=2)
+    print(f"下载结果: {article_report}")
+
+    # Example: download financial data.
+    finance_summary = client.download_financial_data(
+        symbols=['000001', '000002'],
+        start_date='2024-01-01',
+        end_date='2024-03-01',
+    )
+    finance_report = json.dumps(finance_summary, ensure_ascii=False, indent=2)
+    print(f"金融数据下载结果: {finance_report}")
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()