Files
2026-04-29 20:15:06 +08:00

242 lines
9.0 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
聚宽社区文章爬取脚本
"""
import json
import os
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Browser-like request headers passed to the HTTP requests made by this script.
# The long values are split into adjacent literals purely for readability;
# Python concatenates them into the same single strings at compile time.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
def get_community_articles(page_url):
    """Fetch a community listing page and collect the article links on it.

    Every ``<a href>`` whose target contains ``/view/community/detail/`` and
    whose link text is non-empty is treated as an article link.  Relative,
    root-relative and protocol-relative targets are resolved against the
    JoinQuant site root; absolute URLs are kept as they are.  Each resolved
    URL is reported once, keeping the title of its first occurrence.

    Args:
        page_url: URL of the listing page to download.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``category``
        (always the placeholder ``'待分类'``), in page order.  An empty list
        is returned when the response status is not 200 or when any error
        occurs while fetching or parsing the page.
    """
    site_root = 'https://www.joinquant.com'
    print(f"正在获取文章列表: {page_url}")
    try:
        response = requests.get(page_url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        articles = []
        seen_urls = set()  # O(1) duplicate check instead of rescanning the list
        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.get_text(strip=True)
            if '/view/community/detail/' not in href or not text:
                continue
            try:
                # urljoin keeps absolute URLs untouched and also resolves
                # "//host/..." and path-relative targets correctly, which
                # plain string concatenation would mangle.
                url = urljoin(site_root, href)
            except ValueError:
                # A malformed href should only cost us this one link.
                continue
            if url in seen_urls:
                continue
            seen_urls.add(url)
            articles.append({
                'title': text,
                'url': url,
                'category': '待分类',
            })
        print(f"找到 {len(articles)} 篇文章")
        return articles
    except Exception as e:
        # Deliberately best-effort: any failure is reported and turned into
        # an empty result instead of propagating to the caller.
        print(f"获取文章列表时出错: {e}")
        return []
def _join_text_blocks(root):
    """Return the non-empty text of each p/h2/h3/li under *root*, one per line."""
    # NOTE(review): nested matches (e.g. <li><p>..</p></li>) are emitted once
    # per matching element, so their text can appear more than once.
    texts = (
        node.get_text(strip=True)
        for node in root.find_all(['p', 'h2', 'h3', 'li'])
    )
    return '\n'.join(text for text in texts if text)


def get_article_content(article_url):
    """Download one article page and extract its title and body text.

    The title is taken from the first ``<h1>``; when that is missing or has
    no text, the ``<title>`` element is used instead.  The body is looked up
    in a list of candidate container selectors, using the first one that
    yields any text; if none does, the text blocks of the whole document are
    used.

    Args:
        article_url: URL of the article page to download.

    Returns:
        A dict with the keys ``title``, ``url`` and ``content`` (content is
        truncated to 10000 characters), or ``None`` when the response status
        is not 200 or any error occurs while fetching or parsing.
    """
    print(f"正在获取文章内容: {article_url}")
    try:
        time.sleep(1)  # throttle so consecutive requests are not sent too fast
        response = requests.get(article_url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer <h1>, but fall back to <title> when the heading is absent
        # or empty, so an empty heading no longer yields a blank title.
        title = ''
        for tag_name in ('h1', 'title'):
            tag = soup.find(tag_name)
            if tag:
                title = tag.get_text(strip=True)
                if title:
                    break

        # Try the candidate content containers in order of preference.
        content = ''
        content_selectors = (
            '.article-content',
            '.post-content',
            '.content',
            '#article-content',
            'article',
            '.main-content',
        )
        for selector in content_selectors:
            container = soup.select_one(selector)
            if container:
                content = _join_text_blocks(container)
                if content:
                    break

        # No container produced text: fall back to the whole document.
        if not content:
            content = _join_text_blocks(soup)

        return {
            'title': title,
            'url': article_url,
            'content': content[:10000],  # cap the stored content length
        }
    except Exception as e:
        # Deliberately best-effort: report the failure and signal it with
        # None rather than letting it propagate.
        print(f"获取文章内容时出错: {e}")
        return None
def filter_articles(articles, *, limit=5):
    """Select the articles whose title mentions backtesting or live trading.

    Titles are lower-cased before matching, so the English keywords match
    case-insensitively.  Each matching article dict is updated IN PLACE:
    its ``category`` becomes ``'回测'`` when a backtest keyword matches
    (this wins when both kinds match) and ``'实盘'`` otherwise.  The
    returned list holds those same dict objects, in input order.

    Args:
        articles: Iterable of article dicts.  A missing or ``None`` title is
            treated as an empty title and therefore never matches.
        limit: Maximum number of articles to return (non-negative), or
            ``None`` for no cap.  Defaults to 5.

    Returns:
        The first ``limit`` matching articles.  The count that is printed is
        the number of matches before the cap is applied.
    """
    keywords_backtest = ('回测', 'backtest', '回测框架', '回测优化', '策略回测')
    keywords_live = ('实盘', 'live trading', '实盘交易', '实盘经验', '实盘技巧')

    filtered = []
    for article in articles:
        title = (article.get('title') or '').lower()
        is_backtest = any(kw in title for kw in keywords_backtest)
        is_live = any(kw in title for kw in keywords_live)
        if not (is_backtest or is_live):
            continue
        article['category'] = '回测' if is_backtest else '实盘'
        filtered.append(article)

    print(f"筛选出 {len(filtered)} 篇回测/实盘相关文章")
    return filtered if limit is None else filtered[:limit]
def save_articles(articles, output_dir='joinquant_articles'):
    """Fetch each article's content and store everything under *output_dir*.

    The article list is written to ``article_list.json`` before any content
    is fetched, then again at the end with the per-article updates.  Every
    article whose content could be fetched is written to
    ``article_NN.txt`` (NN is its 1-based position in *articles*) and its
    dict is updated in place with ``content_saved=True`` and ``full_title``.
    Articles whose fetch fails are skipped and left unchanged.

    Args:
        articles: List of article dicts; each needs a ``url`` key.
        output_dir: Directory to write into; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of testing for the
    # directory first and creating it afterwards.
    os.makedirs(output_dir, exist_ok=True)
    list_file = os.path.join(output_dir, 'article_list.json')

    def write_list():
        # Dump the current (possibly updated) article dicts to the list file.
        with open(list_file, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

    write_list()
    print(f"文章列表已保存到: {list_file}")

    total = len(articles)
    for i, article in enumerate(articles, 1):
        print(f"\n正在处理第 {i}/{total} 篇文章...")
        article_data = get_article_content(article['url'])
        if not article_data:
            continue

        content_file = os.path.join(output_dir, f'article_{i:02d}.txt')
        with open(content_file, 'w', encoding='utf-8') as f:
            f.writelines([
                f"标题: {article_data['title']}\n",
                f"链接: {article_data['url']}\n",
                f"分类: {article.get('category', '未分类')}\n",
                "=" * 80 + "\n\n",
                article_data['content'],
            ])
        print(f"文章内容已保存到: {content_file}")

        # Record the outcome on the caller's dict so the final list reflects it.
        article['content_saved'] = True
        article['full_title'] = article_data['title']

    # Rewrite the list so it includes the fields added above.
    write_list()
def main():
    """Run the whole pipeline: list articles, pick up to five, save them.

    Steps:
      1. Fetch the first community listing page; if that yields nothing,
         use a built-in fallback list instead.
      2. Write the unfiltered list to ``raw_article_list.json``.
      3. Keep the backtest / live-trading articles and, when fewer than
         five were selected, top the selection up with other articles.
      4. Download and save the selected articles.
    """
    banner = "=" * 80
    print(banner)
    print("聚宽社区文章爬取分析")
    print(banner)

    # First page of the JoinQuant community listing.
    community_url = 'https://www.joinquant.com/view/community/list?listType=1'

    # 1. Fetch the article list.
    articles = get_community_articles(community_url)
    if not articles:
        print("未找到文章,尝试使用备用方案...")
        # NOTE(review): these fallback entries look like placeholders;
        # confirm the detail URLs actually exist before relying on them.
        articles = [
            {'title': '聚宽回测优化实战指南', 'url': 'https://www.joinquant.com/view/community/detail/1', 'category': '回测'},
            {'title': '从回测到实盘:我的量化交易之路', 'url': 'https://www.joinquant.com/view/community/detail/2', 'category': '实盘'},
            {'title': '回测中的常见陷阱及规避方法', 'url': 'https://www.joinquant.com/view/community/detail/3', 'category': '回测'},
            {'title': '实盘交易中的风险管理经验', 'url': 'https://www.joinquant.com/view/community/detail/4', 'category': '实盘'},
            {'title': '高效使用聚宽回测平台的技巧', 'url': 'https://www.joinquant.com/view/community/detail/5', 'category': '回测'},
        ]
        print("使用备用文章列表")

    # Persist the unfiltered list.  exist_ok avoids the check-then-create
    # race of testing for the directory before creating it.
    output_dir = 'joinquant_articles'
    os.makedirs(output_dir, exist_ok=True)
    raw_list_file = os.path.join(output_dir, 'raw_article_list.json')
    with open(raw_list_file, 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
    print(f"原始文章列表已保存到: {raw_list_file}")

    # 2. Filter the articles.
    print("\n" + banner)
    print("筛选回测/实盘相关文章...")
    filtered_articles = filter_articles(articles)

    # Top up with not-yet-selected articles when fewer than five were chosen.
    if len(filtered_articles) < 5:
        print("筛选结果不足5篇,补充文章...")
        remaining = [a for a in articles if a not in filtered_articles]
        needed = 5 - len(filtered_articles)
        filtered_articles.extend(remaining[:needed])

    print("\n" + banner)
    print("最终选择的文章:")
    for i, article in enumerate(filtered_articles, 1):
        print(f"{i}. [{article.get('category', '未分类')}] {article['title']}")
        print(f" {article['url']}")

    # 3. Download and save the article contents.
    print("\n" + banner)
    print("开始爬取文章内容...")
    save_articles(filtered_articles, output_dir)

    print("\n" + banner)
    print("爬取完成!")
    print(f"结果保存在: {os.path.abspath(output_dir)}")
    print(banner)


if __name__ == '__main__':
    main()