Files
solidstate-tools/mcp/SearchPaperByEmbedding/crawl.py
koko c0b2ec5983 sofvBV_mcp重构v2
Embedding copy
2025-10-22 23:59:23 +08:00

67 lines
2.1 KiB
Python

import requests
import json
import time
def fetch_submissions(venue_id, offset=0, limit=1000, timeout=30):
    """Fetch one page of submission notes from the OpenReview v2 API.

    Args:
        venue_id: Venue identifier used as the ``content.venueid`` filter.
        offset: Pagination offset into the result set.
        limit: Maximum number of notes returned in this page.
        timeout: Seconds to wait for the HTTP response. Backward-compatible
            addition — the original call had no timeout and could block
            indefinitely on a stalled connection.

    Returns:
        The decoded JSON payload as a dict (notes are under the ``"notes"`` key).

    Raises:
        requests.HTTPError: On a non-2xx response status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = "https://api2.openreview.net/notes"
    params = {
        "content.venueid": venue_id,
        "details": "replyCount,invitation",
        "limit": limit,
        "offset": offset,
        "sort": "number:desc",
    }
    # Plain browser UA; some endpoints reject requests without one.
    headers = {"User-Agent": "Mozilla/5.0"}
    # Bug fix: always pass a timeout so a dead server can't hang the crawl.
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()
def _note_to_paper(note):
    """Flatten one raw OpenReview note dict into a plain paper record."""
    # Hoisted: the original re-fetched note["content"] for every field.
    content = note.get("content", {})

    def _value(field, default):
        # API v2 wraps each content field as {"value": ...}.
        return content.get(field, {}).get("value", default)

    return {
        "id": note.get("id"),
        "number": note.get("number"),
        "title": _value("title", ""),
        "authors": _value("authors", []),
        "abstract": _value("abstract", ""),
        "keywords": _value("keywords", []),
        "primary_area": _value("primary_area", ""),
        "forum_url": f"https://openreview.net/forum?id={note.get('id')}",
    }


def crawl_papers(venue_id, output_file, fetch_fn=None):
    """Crawl every submission for a venue and dump the results to JSON.

    Pages through the API in chunks of 1000 until an empty or short page
    signals the end of the result set, pausing 0.5 s between pages.

    Args:
        venue_id: OpenReview venue identifier to crawl.
        output_file: Path of the JSON file to write (UTF-8, pretty-printed).
        fetch_fn: Optional page fetcher ``(venue_id, offset, limit) -> dict``.
            Defaults to :func:`fetch_submissions`; injectable for testing.
            Backward-compatible addition.

    Returns:
        The list of flattened paper dicts that was written to disk.
    """
    if fetch_fn is None:
        fetch_fn = fetch_submissions
    all_papers = []
    offset = 0
    limit = 1000
    print(f"Fetching papers from {venue_id}...")
    while True:
        data = fetch_fn(venue_id, offset, limit)
        notes = data.get("notes", [])
        if not notes:
            break
        all_papers.extend(_note_to_paper(note) for note in notes)
        print(f"Fetched {len(notes)} papers (total: {len(all_papers)})")
        if len(notes) < limit:
            # A short page means the result set is exhausted.
            break
        offset += limit
        time.sleep(0.5)  # be polite to the API between pages
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_papers, f, ensure_ascii=False, indent=2)
    print(f"\nTotal: {len(all_papers)} papers")
    print(f"Saved to {output_file}")
    return all_papers
if __name__ == "__main__":
    # Crawl all ICLR 2026 submissions into a local JSON dump.
    target_venue = "ICLR.cc/2026/Conference/Submission"
    destination = "iclr2026_papers.json"
    crawl_papers(venue_id=target_venue, output_file=destination)