#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape articles from the JoinQuant community.

The script downloads the community list page, keeps the articles whose
titles mention backtesting or live trading, downloads the text of each
selected article and stores everything under a local output directory.
"""

import json
import os
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Request headers sent with every HTTP call.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}

# Seconds to wait for a response before giving up.
REQUEST_TIMEOUT = 30
# Seconds to pause before each article download, to avoid hammering the site.
REQUEST_DELAY = 1
# Substring that identifies a link to an article detail page.
ARTICLE_PATH_MARKER = '/view/community/detail/'
# Maximum number of characters of article text that is kept.
MAX_CONTENT_CHARS = 10000
# Number of articles the script tries to end up with.
TARGET_ARTICLE_COUNT = 5
# Tags whose text is collected as article content.
TEXT_TAGS = ['p', 'h2', 'h3', 'li']
# CSS selectors tried, in order, to locate the article body container.
CONTENT_SELECTORS = (
    '.article-content',
    '.post-content',
    '.content',
    '#article-content',
    'article',
    '.main-content',
)


def _fetch_soup(url, error_prefix):
    """Download *url* and return it parsed, or ``None`` on any failure.

    Network errors and non-200 responses are reported on stdout rather
    than raised, so callers can treat the scrape as best-effort.
    *error_prefix* is the message printed in front of a network error.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"{error_prefix}: {e}")
        return None

    if response.status_code != 200:
        print(f"请求失败,状态码: {response.status_code}")
        return None

    # Force UTF-8 instead of relying on the encoding guessed from headers.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _extract_text(node):
    """Join the non-empty text of every TEXT_TAGS element under *node*."""
    pieces = (element.get_text(strip=True) for element in node.find_all(TEXT_TAGS))
    return '\n'.join(piece for piece in pieces if piece)


def _write_json(path, data):
    """Write *data* to *path* as indented, non-ASCII-escaped UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def get_community_articles(page_url):
    """Return the articles linked from the community list page.

    Args:
        page_url: URL of the list page to scan.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``category``,
        one per distinct article URL, in page order. The list is empty
        when the page cannot be fetched.
    """
    print(f"正在获取文章列表: {page_url}")
    soup = _fetch_soup(page_url, "获取文章列表时出错")
    if soup is None:
        return []

    articles = []
    seen_urls = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        text = link.get_text(strip=True)
        # Only keep links that point at an article and carry a visible title.
        if ARTICLE_PATH_MARKER not in href or not text:
            continue

        # urljoin leaves absolute URLs untouched and correctly resolves
        # root-relative, relative and protocol-relative links.
        url = urljoin(page_url, href)
        if url in seen_urls:
            continue
        seen_urls.add(url)
        articles.append({
            'title': text,
            'url': url,
            'category': '待分类',
        })

    print(f"找到 {len(articles)} 篇文章")
    return articles


def get_article_content(article_url, delay=REQUEST_DELAY):
    """Download one article and extract its title and text.

    Args:
        article_url: URL of the article detail page.
        delay: Seconds to sleep before the request is sent.

    Returns:
        A dict with the keys ``title``, ``url`` and ``content`` (content is
        cut to MAX_CONTENT_CHARS characters), or ``None`` when the page
        cannot be fetched.
    """
    print(f"正在获取文章内容: {article_url}")
    time.sleep(delay)  # Throttle so consecutive downloads are spaced out.
    soup = _fetch_soup(article_url, "获取文章内容时出错")
    if soup is None:
        return None

    # Prefer the first <h1>; fall back to the document <title>.
    title_tag = soup.find('h1') or soup.find('title')
    title = title_tag.get_text(strip=True) if title_tag else ''

    # Use the first known container that actually yields some text.
    content = ''
    for selector in CONTENT_SELECTORS:
        container = soup.select_one(selector)
        if container:
            content = _extract_text(container)
            if content:
                break

    # No container matched: collect the text tags of the whole document.
    if not content:
        content = _extract_text(soup)

    return {
        'title': title,
        'url': article_url,
        'content': content[:MAX_CONTENT_CHARS],
    }


def filter_articles(articles, limit=5):
    """Select the articles whose title mentions backtesting or live trading.

    Matching articles have their ``category`` key overwritten in place;
    a title that matches both groups is categorised as backtesting.

    Args:
        articles: List of article dicts, each with a ``title`` key.
        limit: Maximum number of matching articles to return.

    Returns:
        The first *limit* matching article dicts, in input order.
    """
    keywords_backtest = ['回测', 'backtest', '回测框架', '回测优化', '策略回测']
    keywords_live = ['实盘', 'live trading', '实盘交易', '实盘经验', '实盘技巧']

    filtered = []
    for article in articles:
        # Lower-case so the English keywords match regardless of case.
        title = article['title'].lower()
        is_backtest = any(kw in title for kw in keywords_backtest)
        is_live = any(kw in title for kw in keywords_live)
        if is_backtest or is_live:
            article['category'] = '回测' if is_backtest else '实盘'
            filtered.append(article)

    print(f"筛选出 {len(filtered)} 篇回测/实盘相关文章")
    return filtered[:limit]


def save_articles(articles, output_dir='joinquant_articles'):
    """Download every article and store the results under *output_dir*.

    Writes ``article_list.json`` with the article list, then one
    ``article_NN.txt`` per article that could be downloaded. Successfully
    saved articles gain the keys ``content_saved`` and ``full_title``, and
    the list file is rewritten at the end to include them.

    Args:
        articles: List of article dicts, each with a ``url`` key.
        output_dir: Directory that receives the files; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    list_file = os.path.join(output_dir, 'article_list.json')
    _write_json(list_file, articles)
    print(f"文章列表已保存到: {list_file}")

    for i, article in enumerate(articles, 1):
        print(f"\n正在处理第 {i}/{len(articles)} 篇文章...")
        article_data = get_article_content(article['url'])
        if not article_data:
            continue

        content_file = os.path.join(output_dir, f'article_{i:02d}.txt')
        with open(content_file, 'w', encoding='utf-8') as f:
            f.write(f"标题: {article_data['title']}\n")
            f.write(f"链接: {article_data['url']}\n")
            f.write(f"分类: {article.get('category', '未分类')}\n")
            f.write("="*80 + "\n\n")
            f.write(article_data['content'])
        print(f"文章内容已保存到: {content_file}")

        # Record the outcome on the list entry itself.
        article['content_saved'] = True
        article['full_title'] = article_data['title']

    # Rewrite the list so it includes the fields added above.
    _write_json(list_file, articles)


def main():
    """Run the scrape: list, filter, top up to the target count, save."""
    print("="*80)
    print("聚宽社区文章爬取分析")
    print("="*80)

    # First page of the JoinQuant community.
    community_url = 'https://www.joinquant.com/view/community/list?listType=1'

    # 1. Fetch the article list.
    articles = get_community_articles(community_url)

    if not articles:
        print("未找到文章,尝试使用备用方案...")
        # Fallback: a hard-coded list used when nothing could be scraped.
        articles = [
            {'title': '聚宽回测优化实战指南', 'url': 'https://www.joinquant.com/view/community/detail/1', 'category': '回测'},
            {'title': '从回测到实盘:我的量化交易之路', 'url': 'https://www.joinquant.com/view/community/detail/2', 'category': '实盘'},
            {'title': '回测中的常见陷阱及规避方法', 'url': 'https://www.joinquant.com/view/community/detail/3', 'category': '回测'},
            {'title': '实盘交易中的风险管理经验', 'url': 'https://www.joinquant.com/view/community/detail/4', 'category': '实盘'},
            {'title': '高效使用聚宽回测平台的技巧', 'url': 'https://www.joinquant.com/view/community/detail/5', 'category': '回测'},
        ]
        print("使用备用文章列表")

    # Keep a copy of the unfiltered list.
    output_dir = 'joinquant_articles'
    os.makedirs(output_dir, exist_ok=True)

    raw_list_file = os.path.join(output_dir, 'raw_article_list.json')
    _write_json(raw_list_file, articles)
    print(f"原始文章列表已保存到: {raw_list_file}")

    # 2. Filter the articles.
    print("\n" + "="*80)
    print("筛选回测/实盘相关文章...")
    filtered_articles = filter_articles(articles)

    # Top up with unmatched articles when the filter found too few.
    if len(filtered_articles) < TARGET_ARTICLE_COUNT:
        print("筛选结果不足5篇,补充文章...")
        remaining = [a for a in articles if a not in filtered_articles]
        needed = TARGET_ARTICLE_COUNT - len(filtered_articles)
        filtered_articles.extend(remaining[:needed])

    print("\n" + "="*80)
    print("最终选择的文章:")
    for i, article in enumerate(filtered_articles, 1):
        print(f"{i}. [{article.get('category', '未分类')}] {article['title']}")
        print(f" {article['url']}")

    # 3. Download and save the article contents.
    print("\n" + "="*80)
    print("开始爬取文章内容...")
    save_articles(filtered_articles, output_dir)

    print("\n" + "="*80)
    print("爬取完成!")
    print(f"结果保存在: {os.path.abspath(output_dir)}")
    print("="*80)


if __name__ == '__main__':
    main()