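"""Crawl submission metadata for an OpenReview venue via the api2 `/notes`
endpoint and save it as a JSON array. The entry point below targets ICLR 2026
submissions, but the venue id is a plain parameter."""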
import json
import time

import requests


def fetch_submissions(venue_id, offset=0, limit=1000):
    """Fetch one page of submission notes for `venue_id` from the OpenReview API."""
    url = "https://api2.openreview.net/notes"
    params = {
        "content.venueid": venue_id,
        "details": "replyCount,invitation",
        "limit": limit,
        "offset": offset,
        "sort": "number:desc",  # highest submission numbers first
    }
    # Send a browser-like User-Agent with the request.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    return response.json()


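# Optional robustness sketch, not part of the original script: wrap the page
# fetch in a small retry loop for transient network or 5xx failures. The retry
# count and backoff schedule here are assumptions, not OpenReview requirements.
def fetch_submissions_with_retry(venue_id, offset=0, limit=1000, retries=3):
    for attempt in range(retries):
        try:
            return fetch_submissions(venue_id, offset, limit)
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(2 ** attempt)  # simple exponential backoff

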
def crawl_papers(venue_id, output_file):
    """Page through all submissions for `venue_id` and dump them to a JSON file."""
    all_papers = []
    offset = 0
    limit = 1000

    print(f"Fetching papers from {venue_id}...")

    while True:
        data = fetch_submissions(venue_id, offset, limit)
        notes = data.get("notes", [])

        if not notes:
            break

        for note in notes:
            content = note.get("content", {})
            paper = {
                "id": note.get("id"),
                "number": note.get("number"),
                "title": content.get("title", {}).get("value", ""),
                "authors": content.get("authors", {}).get("value", []),
                "abstract": content.get("abstract", {}).get("value", ""),
                "keywords": content.get("keywords", {}).get("value", []),
                "primary_area": content.get("primary_area", {}).get("value", ""),
                "forum_url": f"https://openreview.net/forum?id={note.get('id')}",
            }
            all_papers.append(paper)

        print(f"Fetched {len(notes)} papers (total: {len(all_papers)})")

        # A short page means this was the last page.
        if len(notes) < limit:
            break

        offset += limit
        time.sleep(0.5)  # brief pause between pages to go easy on the API

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_papers, f, ensure_ascii=False, indent=2)

    print(f"\nTotal: {len(all_papers)} papers")
    print(f"Saved to {output_file}")
    return all_papers


if __name__ == "__main__":
    crawl_papers(
        venue_id="ICLR.cc/2026/Conference/Submission",
        output_file="iclr2026_papers.json",
    )
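# Usage sketch: the output file is a JSON array, so it can be reloaded with the
# standard library. The file name matches the example invocation above.
#
#   import json
#   with open("iclr2026_papers.json", encoding="utf-8") as f:
#       papers = json.load(f)
#   print(len(papers), papers[0]["title"])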