A few minor changes

2025-11-19 12:23:17 +08:00
parent 95d719cc1e
commit 80ae03c8c1
25 changed files with 2291 additions and 17 deletions

rss/nature_filter_rss.py Normal file

@@ -0,0 +1,122 @@
import feedparser
import requests
from feedgen.feed import FeedGenerator
from datetime import datetime, timezone
import time
# --- 1. Configuration ---
# Your keyword list; matching is case-insensitive
KEYWORDS = ['solid-state battery', 'lithium metal', 'anode-free', 'electrolyte']
# The Nature-family journal RSS feeds you want to monitor
SOURCE_FEEDS = {
    'Nature': 'https://www.nature.com/nature/rss/current',
    'Nat Commun': 'https://www.nature.com/ncomms/rss/current',
    'Nat Energy': 'https://www.nature.com/nenergy/rss/current',
    'Nat Mater': 'https://www.nature.com/nmat/rss/current',
    'Nat Nanotechnol': 'https://www.nature.com/nnano/rss/current',
    'Nat Sustain': 'https://www.nature.com/natsustain/rss/current',
    'Nat Chem': 'https://www.nature.com/nchem/rss/current',
    'Nat Synth': 'https://www.nature.com/natsynth/rss/current',
    'Nat Catal': 'https://www.nature.com/natcatal/rss/current',
    'Nat Rev Mater': 'https://www.nature.com/natrevmat/rss/current',
    'Nat Rev Chem': 'https://www.nature.com/natrevchem/rss/current',
    'Nat Rev Earth Environ': 'https://www.nature.com/natrevearthenviron/rss/current',
}
# Path of the generated RSS file; make sure ttrss can reach it through the web server
OUTPUT_FILE = '/var/www/html/rss/nature_filtered_feed.xml'

# --- 2. Core logic ---

def fetch_and_filter():
"""获取所有源,过滤文章,返回一个匹配文章的列表"""
print(f"Starting feed fetch at {datetime.now()}")
matched_articles = []
# 使用集合来存储已添加文章的链接,防止重复
seen_links = set()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
for name, url in SOURCE_FEEDS.items():
print(f" -> Fetching from {name}...")
try:
# 使用 requests 获取内容,可以更好地处理网络问题和伪装 User-Agent
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status() # 确保请求成功
# 使用 feedparser 解析获取到的内容
feed = feedparser.parse(response.content)
for entry in feed.entries:
# 检查文章链接是否已处理过
if entry.link in seen_links:
continue
# 将标题和摘要拼接在一起,方便搜索
content_to_check = (entry.title + ' ' + entry.get('summary', '')).lower()
# 检查是否有任何一个关键词出现在内容中
if any(keyword.lower() in content_to_check for keyword in KEYWORDS):
print(f" [MATCH FOUND] in {name}: {entry.title}")
# 为了在 RSS 阅读器中更好地展示,我们在标题前加上来源期刊
entry.title = f"[{name}] {entry.title}"
matched_articles.append(entry)
seen_links.add(entry.link)
# 友好请求,避免过于频繁
time.sleep(1)
except requests.RequestException as e:
print(f" [ERROR] Could not fetch {name}: {e}")
except Exception as e:
print(f" [ERROR] An unexpected error occurred for {name}: {e}")
print(f"\nFound {len(matched_articles)} matching articles in total.")
return matched_articles


def generate_filtered_feed(articles):
    """Generate a new RSS file from the filtered list of articles."""
    fg = FeedGenerator()
    fg.title('My Filtered Nature Research Feed')
    fg.link(href='https://www.nature.com', rel='alternate')
    fg.description(f"Custom RSS feed for Nature journals, filtered by keywords: {', '.join(KEYWORDS)}")
    # Sort articles by publication date, newest first; fall back to the epoch
    # for entries without a usable date so the comparison cannot fail
    articles.sort(
        key=lambda x: x.get('published_parsed') or x.get('updated_parsed') or time.gmtime(0),
        reverse=True,
    )
    for entry in articles:
        fe = fg.add_entry()
        fe.id(entry.link)  # Use the article link as the unique ID
        fe.title(entry.title)
        fe.link(href=entry.link)
        # feedparser has already extracted the summary for us
        fe.description(entry.get('summary', 'No summary available.'))
        # Handle the publication date
        pub_date = entry.get('published_parsed')
        if pub_date:
            # feedparser normalizes *_parsed fields to UTC, so build a
            # timezone-aware datetime straight from the struct_time
            fe.published(datetime(*pub_date[:6], tzinfo=timezone.utc))
    # Write the file to disk
    fg.rss_file(OUTPUT_FILE, pretty=True)
    print(f"Successfully generated new RSS feed at {OUTPUT_FILE}")


# --- 3. Main entry point ---
if __name__ == "__main__":
    filtered_articles = fetch_and_filter()
    if filtered_articles:
        generate_filtered_feed(filtered_articles)
    else:
        print("No new matching articles found. RSS file not updated.")