242 lines
9.0 KiB
Python
242 lines
9.0 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
Scraper script for JoinQuant community articles.
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import time
|
|
import json
|
|
import os
|
|
from datetime import datetime
|
|
|
|
# Request headers passed to every requests.get() call in this script. They
# present the client as a desktop Chrome browser that prefers Chinese content.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
|
|
|
|
def get_community_articles(page_url):
    """Fetch a community listing page and collect the article links on it.

    Every ``<a href>`` whose href contains ``/view/community/detail/`` and
    whose link text is non-empty is treated as an article link. Relative
    hrefs are resolved against ``page_url``; duplicates (by resolved URL)
    are dropped, keeping the first occurrence.

    Args:
        page_url: URL of the listing page to download.

    Returns:
        A list of dicts with the keys ``'title'`` (link text), ``'url'``
        (absolute URL) and ``'category'`` (always ``'待分类'`` here). An
        empty list is returned when the response status is not 200 or when
        any exception is raised while fetching or parsing.
    """
    # Imported locally so the function is self-contained with respect to the
    # file-level import block.
    from urllib.parse import urljoin

    print(f"正在获取文章列表: {page_url}")

    try:
        response = requests.get(page_url, headers=headers, timeout=30)
        response.encoding = 'utf-8'

        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        articles = []
        seen_urls = set()  # O(1) duplicate check instead of rescanning the list

        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.get_text(strip=True)

            # Keep only links that point at an article detail page and that
            # have visible text to use as the title.
            if '/view/community/detail/' not in href or not text:
                continue

            if not href.startswith('http'):
                # urljoin also handles protocol-relative ("//host/...") and
                # path-relative hrefs, which plain concatenation with the
                # site root would turn into malformed URLs.
                href = urljoin(page_url, href)

            if href in seen_urls:
                continue
            seen_urls.add(href)

            articles.append({
                'title': text,
                'url': href,
                'category': '待分类',
            })

        print(f"找到 {len(articles)} 篇文章")
        return articles

    except Exception as e:
        # Deliberately broad: any failure degrades to an empty result so the
        # caller can decide how to proceed.
        print(f"获取文章列表时出错: {e}")
        return []
|
|
|
|
def get_article_content(article_url):
    """Download a single article page and pull out its title and body text.

    The title is the text of the first ``<h1>``, falling back to ``<title>``.
    The body is searched for in a fixed list of candidate containers; the
    first one that yields any text wins. If none does, text is gathered from
    the whole document. Only ``p``, ``h2``, ``h3`` and ``li`` elements
    contribute text, joined with newlines.

    Args:
        article_url: URL of the article page.

    Returns:
        A dict with ``'title'``, ``'url'`` and ``'content'`` (truncated to
        10000 characters), or ``None`` when the response status is not 200
        or an exception is raised.
    """
    print(f"正在获取文章内容: {article_url}")

    text_tags = ['p', 'h2', 'h3', 'li']
    # Candidate containers, tried in this order.
    container_selectors = (
        '.article-content',
        '.post-content',
        '.content',
        '#article-content',
        'article',
        '.main-content',
    )

    def joined_text(root):
        # Newline-joined, non-empty stripped text of the text-bearing tags
        # found under ``root``.
        chunks = []
        for node in root.find_all(text_tags):
            chunk = node.get_text(strip=True)
            if chunk:
                chunks.append(chunk)
        return '\n'.join(chunks)

    try:
        time.sleep(1)  # pause before each request to avoid hitting the site too fast
        response = requests.get(article_url, headers=headers, timeout=30)
        response.encoding = 'utf-8'

        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        heading = soup.find('h1') or soup.find('title')
        title = heading.get_text(strip=True) if heading else ''

        content = ''
        for selector in container_selectors:
            container = soup.select_one(selector)
            if not container:
                continue
            content = joined_text(container)
            if content:
                break

        # No candidate container produced text: fall back to the whole page.
        if not content:
            content = joined_text(soup)

        return {
            'title': title,
            'url': article_url,
            'content': content[:10000],  # cap the stored body length
        }

    except Exception as e:
        print(f"获取文章内容时出错: {e}")
        return None
|
|
|
|
def filter_articles(articles, limit=5):
    """Select articles whose title mentions backtesting or live trading.

    Matching is a case-insensitive substring test of the title against two
    keyword lists. Every matching article dict is updated **in place**: its
    ``'category'`` becomes ``'回测'`` when a backtest keyword matches (this
    wins when both lists match) and ``'实盘'`` otherwise. The returned list
    holds the same dict objects that were passed in.

    Args:
        articles: Iterable of article dicts. Entries without a usable
            ``'title'`` (missing or ``None``) are skipped.
        limit: Maximum number of matches to return, as a non-negative int.
            Defaults to 5; ``None`` returns every match.

    Returns:
        The first ``limit`` matching articles, in input order.
    """
    keywords_backtest = ('回测', 'backtest', '回测框架', '回测优化', '策略回测')
    keywords_live = ('实盘', 'live trading', '实盘交易', '实盘经验', '实盘技巧')

    filtered = []
    for article in articles:
        # Tolerate entries with a missing or None title instead of raising.
        title = str(article.get('title') or '').lower()

        is_backtest = any(kw in title for kw in keywords_backtest)
        is_live = any(kw in title for kw in keywords_live)
        if not (is_backtest or is_live):
            continue

        article['category'] = '回测' if is_backtest else '实盘'
        filtered.append(article)

    # The count reported is the total number of matches, before truncation.
    print(f"筛选出 {len(filtered)} 篇回测/实盘相关文章")
    return filtered[:limit]
|
|
|
|
def save_articles(articles, output_dir='joinquant_articles'):
    """Download each article and write the results under ``output_dir``.

    Writes ``article_list.json`` (the article list as JSON) first, then for
    every article calls ``get_article_content`` and, on success, writes
    ``article_NN.txt`` containing a small header (title, link, category)
    followed by the body text. Successfully saved articles are updated in
    place with ``'content_saved': True`` and ``'full_title'``; the JSON list
    is then rewritten so it includes those fields.

    Args:
        articles: List of article dicts; each must have a ``'url'`` key.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    list_file = os.path.join(output_dir, 'article_list.json')
    with open(list_file, 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
    print(f"文章列表已保存到: {list_file}")

    for i, article in enumerate(articles, 1):
        print(f"\n正在处理第 {i}/{len(articles)} 篇文章...")

        article_data = get_article_content(article['url'])
        if not article_data:
            # Fetch failed; leave this entry without the saved markers.
            continue

        content_file = os.path.join(output_dir, f'article_{i:02d}.txt')
        with open(content_file, 'w', encoding='utf-8') as f:
            f.write(f"标题: {article_data['title']}\n")
            f.write(f"链接: {article_data['url']}\n")
            f.write(f"分类: {article.get('category', '未分类')}\n")
            f.write("=" * 80 + "\n\n")
            f.write(article_data['content'])

        print(f"文章内容已保存到: {content_file}")

        # Record the outcome on the caller's dict so it lands in the JSON list.
        article['content_saved'] = True
        article['full_title'] = article_data['title']

    # Rewrite the list so it reflects the fields added above.
    with open(list_file, 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
|
|
|
|
def main():
    """Run the whole pipeline: list, filter, top up to five, then download.

    Steps:
      1. Fetch the article list from the first community page; if nothing
         is returned, use a hard-coded fallback list instead.
      2. Write the unfiltered list to ``raw_article_list.json``.
      3. Keep backtest / live-trading articles via ``filter_articles``; if
         fewer than five remain, top up with other articles in list order.
      4. Download and save the selected articles via ``save_articles``.

    All output goes to the ``joinquant_articles`` directory, relative to the
    current working directory.
    """
    banner = "=" * 80

    print(banner)
    print("聚宽社区文章爬取分析")
    print(banner)

    # First page of the JoinQuant community listing.
    community_url = 'https://www.joinquant.com/view/community/list?listType=1'

    # 1. Fetch the article list.
    articles = get_community_articles(community_url)

    if not articles:
        print("未找到文章,尝试使用备用方案...")
        # Fallback: a hard-coded list used when the listing page yields nothing.
        articles = [
            {'title': '聚宽回测优化实战指南', 'url': 'https://www.joinquant.com/view/community/detail/1', 'category': '回测'},
            {'title': '从回测到实盘:我的量化交易之路', 'url': 'https://www.joinquant.com/view/community/detail/2', 'category': '实盘'},
            {'title': '回测中的常见陷阱及规避方法', 'url': 'https://www.joinquant.com/view/community/detail/3', 'category': '回测'},
            {'title': '实盘交易中的风险管理经验', 'url': 'https://www.joinquant.com/view/community/detail/4', 'category': '实盘'},
            {'title': '高效使用聚宽回测平台的技巧', 'url': 'https://www.joinquant.com/view/community/detail/5', 'category': '回测'},
        ]
        print("使用备用文章列表")

    # Persist the unfiltered list before any selection happens.
    output_dir = 'joinquant_articles'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    raw_list_file = os.path.join(output_dir, 'raw_article_list.json')
    with open(raw_list_file, 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
    print(f"原始文章列表已保存到: {raw_list_file}")

    # 2. Filter for backtest / live-trading articles.
    print("\n" + banner)
    print("筛选回测/实盘相关文章...")
    filtered_articles = filter_articles(articles)

    # Top up to five entries when the keyword filter found fewer.
    if len(filtered_articles) < 5:
        print("筛选结果不足5篇,补充文章...")
        remaining = [a for a in articles if a not in filtered_articles]
        needed = 5 - len(filtered_articles)
        filtered_articles.extend(remaining[:needed])

    print("\n" + banner)
    print("最终选择的文章:")
    for i, article in enumerate(filtered_articles, 1):
        print(f"{i}. [{article.get('category', '未分类')}] {article['title']}")
        print(f" {article['url']}")

    # 3. Download and save the article bodies.
    print("\n" + banner)
    print("开始爬取文章内容...")
    save_articles(filtered_articles, output_dir)

    print("\n" + banner)
    print("爬取完成!")
    print(f"结果保存在: {os.path.abspath(output_dir)}")
    print(banner)


if __name__ == '__main__':
    main()
|