一些小修改
This commit is contained in:
122
rss/nature_filter_rss.py
Normal file
122
rss/nature_filter_rss.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import feedparser
|
||||
import requests
|
||||
from feedgen.feed import FeedGenerator
|
||||
from datetime import datetime, timezone
|
||||
import time
|
||||
|
||||
# --- 1. Configuration ---
|
||||
|
||||
# Keywords to look for in each article's title and summary.
# Matching is case-insensitive.
KEYWORDS = ['solid-state battery', 'lithium metal', 'anode-free', 'electrolyte']

# RSS sources of the Nature-family journals to monitor, keyed by the short
# journal name that is prefixed to matching article titles.
SOURCE_FEEDS = {
    'Nature': 'https://www.nature.com/nature/rss/current',
    'Nat Commun': 'https://www.nature.com/ncomms/rss/current',
    'Nat Energy': 'https://www.nature.com/nenergy/rss/current',
    'Nat Mater': 'https://www.nature.com/nmat/rss/current',
    'Nat Nanotechnol': 'https://www.nature.com/nnano/rss/current',
    'Nat Sustain': 'https://www.nature.com/natsustain/rss/current',
    'Nat Chem': 'https://www.nature.com/nchem/rss/current',
    'Nat Synth': 'https://www.nature.com/natsynth/rss/current',
    'Nat Catal': 'https://www.nature.com/natcatal/rss/current',
    'Nat Rev Mater': 'https://www.nature.com/natrevmat/rss/current',
    'Nat Rev Chem': 'https://www.nature.com/natrevchem/rss/current',
    'Nat Rev Earth Environ': 'https://www.nature.com/natrevearthenviron/rss/current',
}

# Path of the generated RSS file. It must be reachable by ttrss through the
# web server.
OUTPUT_FILE = '/var/www/html/rss/nature_filtered_feed.xml'
|
||||
|
||||
|
||||
# --- 2. Core script logic ---
|
||||
def fetch_and_filter():
    """Fetch every feed in SOURCE_FEEDS and collect the entries matching KEYWORDS.

    Each feed is downloaded with ``requests`` (browser-like User-Agent, 15 s
    timeout) and parsed with ``feedparser``. An entry matches when any keyword
    occurs, case-insensitively, in its title or summary. Entries are
    de-duplicated by link across all feeds. A failure on one feed is reported
    and the remaining feeds are still processed.

    Returns:
        list: The matching feedparser entries in fetch order. Each returned
        entry has its ``title`` attribute rewritten to
        ``"[<journal name>] <original title>"``.
    """
    print(f"Starting feed fetch at {datetime.now()}")

    matched_articles = []
    # Links of entries already accepted, so the same article is not added twice.
    seen_links = set()
    # Lower-case the keywords once instead of once per entry.
    lowered_keywords = [keyword.lower() for keyword in KEYWORDS]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for name, url in SOURCE_FEEDS.items():
        print(f" -> Fetching from {name}...")
        try:
            # Download with requests rather than letting feedparser fetch the
            # URL, so the timeout and User-Agent are under our control.
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()

            feed = feedparser.parse(response.content)

            for entry in feed.entries:
                # Use .get(): an entry lacking a link or title must not raise
                # and abort the rest of this feed. An entry without a link is
                # skipped because the link is its de-duplication key and is
                # later used as its id in the generated feed.
                link = entry.get('link')
                if not link or link in seen_links:
                    continue

                title = entry.get('title', '')
                # Search title and summary together.
                content_to_check = (title + ' ' + entry.get('summary', '')).lower()

                if any(keyword in content_to_check for keyword in lowered_keywords):
                    print(f" [MATCH FOUND] in {name}: {title}")

                    # Prefix the source journal so it shows in the RSS reader.
                    # Assigned as an attribute because the feed generator
                    # reads ``entry.title``.
                    entry.title = f"[{name}] {title}"
                    matched_articles.append(entry)
                    seen_links.add(link)

            # Be polite: pause between successive feed requests.
            time.sleep(1)

        except requests.RequestException as e:
            print(f" [ERROR] Could not fetch {name}: {e}")
        except Exception as e:
            # Deliberate per-feed boundary: one malformed feed must not stop
            # the remaining journals from being processed.
            print(f" [ERROR] An unexpected error occurred for {name}: {e}")

    print(f"\nFound {len(matched_articles)} matching articles in total.")
    return matched_articles
|
||||
|
||||
|
||||
def _entry_time(entry):
    """Return the entry's parsed publication time, else its update time, else None."""
    return entry.get('published_parsed') or entry.get('updated_parsed')


def _to_utc_datetime(parsed_time):
    """Convert a feedparser ``*_parsed`` time tuple to a timezone-aware UTC datetime."""
    year, month, day, hour, minute, second = parsed_time[:6]
    # A time tuple may carry a leap second (60 or 61), which datetime rejects.
    return datetime(year, month, day, hour, minute, min(second, 59), tzinfo=timezone.utc)


def generate_filtered_feed(articles):
    """Write the filtered articles to OUTPUT_FILE as an RSS feed, newest first.

    Args:
        articles: Iterable of feedparser entries, as returned by
            ``fetch_and_filter``. Each entry must expose ``link`` and
            ``title``. ``summary``, ``published_parsed`` and
            ``updated_parsed`` are optional. The input is not modified.

    Returns:
        None. The feed is written to ``OUTPUT_FILE``.
    """
    fg = FeedGenerator()
    fg.title('My Filtered Nature Research Feed')
    fg.link(href='https://www.nature.com', rel='alternate')
    fg.description(f"Custom RSS feed for Nature journals, filtered by keywords: {', '.join(KEYWORDS)}")

    # Newest first. Entries with no date at all get an empty tuple as key, so
    # they sort last instead of raising TypeError on a None comparison.
    ordered = sorted(
        articles,
        key=lambda entry: tuple(_entry_time(entry) or ()),
        reverse=True,
    )

    for entry in ordered:
        # feedgen prepends new entries by default, which would reverse the
        # order established above; append keeps newest first.
        fe = fg.add_entry(order='append')
        fe.id(entry.link)  # the article link doubles as the unique id
        fe.title(entry.title)
        fe.link(href=entry.link)
        # feedparser has already extracted the summary for us.
        fe.description(entry.get('summary', 'No summary available.'))

        # Use the same date the sort key uses, so entries that only carry an
        # update time still get a publication date.
        parsed_time = _entry_time(entry)
        if parsed_time:
            # feedparser documents its *_parsed values as UTC, so build the
            # datetime directly; time.mktime() would read the tuple as local
            # time and can shift the result by an hour under DST.
            fe.published(_to_utc_datetime(parsed_time))

    fg.rss_file(OUTPUT_FILE, pretty=True)
    print(f"Successfully generated new RSS feed at {OUTPUT_FILE}")
|
||||
|
||||
|
||||
# --- 3. Main entry point ---
|
||||
if __name__ == "__main__":
    # Fetch and filter first; the output file is rewritten only when at least
    # one article matched, so an empty run leaves the existing feed untouched.
    matches = fetch_and_filter()
    if not matches:
        print("No new matching articles found. RSS file not updated.")
    else:
        generate_filtered_feed(matches)
|
||||
Reference in New Issue
Block a user