sofvBV_mcp重构v2

Embedding copy
This commit is contained in:
2025-10-22 23:59:23 +08:00
parent b9ba79d7a8
commit c0b2ec5983
8 changed files with 916 additions and 50 deletions

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 gyj155
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,96 @@
# Paper Semantic Search
Find similar papers using semantic search. Supports both local models (free) and OpenAI API (better quality).
## Features
- Fetch papers from OpenReview (e.g., ICLR 2026 submissions)
- Semantic search with example papers or text queries
- Support embedding caching
- Embedding model support: open-source (e.g., all-MiniLM-L6-v2) or OpenAI
## Quick Start
```bash
pip install -r requirements.txt
```
### 1. Prepare Papers
```python
from crawl import crawl_papers
crawl_papers(
venue_id="ICLR.cc/2026/Conference/Submission",
output_file="iclr2026_papers.json"
)
```
### 2. Search Papers
```python
from search import PaperSearcher
# Local model (free)
searcher = PaperSearcher('iclr2026_papers.json', model_type='local')
# OpenAI model (better, requires API key)
# export OPENAI_API_KEY='your-key'
# searcher = PaperSearcher('iclr2026_papers.json', model_type='openai')
searcher.compute_embeddings()
# Search with example papers that you are interested in
examples = [
{
"title": "Your paper title",
"abstract": "Your paper abstract..."
}
]
results = searcher.search(examples=examples, top_k=100)
# Or search with text query
results = searcher.search(query="interesting topics", top_k=100)
searcher.display(results, n=10)
searcher.save(results, 'results.json')
```
## How It Works
1. Paper titles and abstracts are converted to embeddings
2. Embeddings are cached automatically
3. Your query is embedded using the same model
4. Cosine similarity finds the most similar papers
5. Results are ranked by similarity score
## Cache
Embeddings are cached next to the papers file as `cache_<filename>_<hash>_<model>.npy`. Delete the cache file to force recomputation.
## Example Output
```
================================================================================
Top 100 Results (showing 10)
================================================================================
1. [0.8456] Paper a
#12345 | foundation or frontier models, including LLMs
https://openreview.net/forum?id=xxx
2. [0.8234] Paper b
#12346 | applications to robotics, autonomy, planning
https://openreview.net/forum?id=yyy
```
## Tips
- Use 1-5 example papers for best results, or a one-paragraph description of the topic you are interested in
- Local model is good enough for most cases
- Use the OpenAI model when search quality is critical (~$1 for 18k queries)
If you find this project useful, please consider giving it a star~

View File

@@ -0,0 +1,66 @@
import requests
import json
import time
def fetch_submissions(venue_id, offset=0, limit=1000, timeout=30):
    """Fetch one page of submission notes from the OpenReview API.

    Args:
        venue_id: Value sent as the ``content.venueid`` filter.
        offset: Number of notes to skip (used for pagination).
        limit: Maximum number of notes requested for this page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = "https://api2.openreview.net/notes"
    params = {
        "content.venueid": venue_id,
        "details": "replyCount,invitation",
        "limit": limit,
        "offset": offset,
        "sort": "number:desc",
    }
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()
def crawl_papers(venue_id, output_file):
    """Download every submission of a venue and dump them to a JSON file.

    Pages through ``fetch_submissions`` until an empty or short page is
    returned, flattens each note into a plain dict, writes the list to
    ``output_file`` and returns it.

    Args:
        venue_id: Venue identifier forwarded to ``fetch_submissions``.
        output_file: Path of the JSON file to write.

    Returns:
        The list of paper dicts that was written to ``output_file``.
    """
    page_size = 1000

    def read_field(note, key, fallback):
        # Each content field is looked up as {"value": ...}; absent fields
        # fall back to an empty default.
        return note.get("content", {}).get(key, {}).get("value", fallback)

    def to_paper(note):
        return {
            "id": note.get("id"),
            "number": note.get("number"),
            "title": read_field(note, "title", ""),
            "authors": read_field(note, "authors", []),
            "abstract": read_field(note, "abstract", ""),
            "keywords": read_field(note, "keywords", []),
            "primary_area": read_field(note, "primary_area", ""),
            "forum_url": f"https://openreview.net/forum?id={note.get('id')}",
        }

    collected = []
    position = 0
    print(f"Fetching papers from {venue_id}...")

    while True:
        page = fetch_submissions(venue_id, position, page_size).get("notes", [])
        if not page:
            break

        collected.extend([to_paper(note) for note in page])
        print(f"Fetched {len(page)} papers (total: {len(collected)})")

        # A short page means the server has nothing more to return.
        if len(page) < page_size:
            break

        position += page_size
        time.sleep(0.5)  # brief pause between consecutive page requests

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(collected, out, ensure_ascii=False, indent=2)

    print(f"\nTotal: {len(collected)} papers")
    print(f"Saved to {output_file}")
    return collected
if __name__ == "__main__":
    # Run as a script: crawl the ICLR 2026 submissions into a local JSON file.
    crawl_papers(venue_id="ICLR.cc/2026/Conference/Submission", output_file="iclr2026_papers.json")

View File

@@ -0,0 +1,22 @@
from search import PaperSearcher
# Embedding backend: the local model is free to run.
searcher = PaperSearcher("iclr2026_papers.json", model_type="local")
# Alternative backend: OpenAI embeddings (better quality).
# searcher = PaperSearcher('iclr2026_papers.json', model_type='openai')

# Embed every paper (or reuse the cached embeddings when available).
searcher.compute_embeddings()

# Papers of interest; the search looks for submissions similar to these.
examples = [
    dict(
        title="Improving Developer Emotion Classification via LLM-Based Augmentation",
        abstract="Detecting developer emotion in the informative data stream of technical commit messages...",
    ),
]

results = searcher.search(examples=examples, top_k=100)

# Show the ten best matches and keep the full ranking on disk.
searcher.display(results, n=10)
searcher.save(results, "results.json")

View File

@@ -0,0 +1,6 @@
requests
numpy
scikit-learn
sentence-transformers
openai

View File

@@ -0,0 +1,156 @@
import json
import numpy as np
import os
import hashlib
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
class PaperSearcher:
    """Rank papers from a JSON file by embedding similarity to a query.

    Papers are embedded either through the OpenAI embeddings API or a local
    SentenceTransformer model. The embedding matrix is cached as a ``.npy``
    file next to the papers file and reused when its row count matches the
    number of papers.
    """

    def __init__(self, papers_file, model_type="openai", api_key=None, base_url=None):
        """Load the papers, set up the embedding backend and try the cache.

        Args:
            papers_file: Path (``str`` or ``pathlib.Path``) of a JSON file
                holding a list of paper dicts.
            model_type: ``"openai"`` selects the OpenAI embeddings API; any
                other value selects the local SentenceTransformer model.
            api_key: OpenAI API key; falls back to the ``OPENAI_API_KEY``
                environment variable when omitted.
            base_url: Optional base URL handed to the OpenAI client.
        """
        with open(papers_file, 'r', encoding='utf-8') as f:
            self.papers = json.load(f)

        self.model_type = model_type
        self.cache_file = self._get_cache_file(papers_file, model_type)
        self.embeddings = None

        # Backends are imported lazily so only the selected one must be installed.
        if model_type == "openai":
            from openai import OpenAI
            self.client = OpenAI(
                api_key=api_key or os.getenv('OPENAI_API_KEY'),
                base_url=base_url
            )
            self.model_name = "text-embedding-3-large"
        else:
            from sentence_transformers import SentenceTransformer
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.model_name = "all-MiniLM-L6-v2"

        self._load_cache()

    def _get_cache_file(self, papers_file, model_type):
        """Return the cache path for this papers file and backend.

        The cache lives in the same directory as the papers file and is named
        ``cache_<stem>_<hash>_<model_type>.npy``, where ``<hash>`` is the first
        8 hex digits of the MD5 of the path string as given.
        """
        papers_path = Path(papers_file)
        # str() makes pathlib.Path arguments work; for a str it is a no-op,
        # so existing cache names are unchanged.
        file_hash = hashlib.md5(str(papers_file).encode()).hexdigest()[:8]
        cache_name = f"cache_{papers_path.stem}_{file_hash}_{model_type}.npy"
        return str(papers_path.parent / cache_name)

    def _load_cache(self):
        """Load cached embeddings if present and matching the paper count.

        Returns:
            True when the cache was loaded into ``self.embeddings``; False
            when the file is missing, unreadable, or has a different number
            of rows than there are papers (``self.embeddings`` is then None).
        """
        if not os.path.exists(self.cache_file):
            return False

        try:
            cached = np.load(self.cache_file)
            usable = len(cached) == len(self.papers)
        except (OSError, ValueError, EOFError, TypeError):
            # Corrupt or unreadable cache: fall back to recomputing. Only
            # load/shape errors are caught so Ctrl-C still interrupts.
            usable = False

        if not usable:
            self.embeddings = None
            return False

        self.embeddings = cached
        print(f"Loaded cache: {self.embeddings.shape}")
        return True

    def _save_cache(self):
        """Write ``self.embeddings`` to the cache file."""
        np.save(self.cache_file, self.embeddings)
        print(f"Saved cache: {self.cache_file}")

    def _create_text(self, paper):
        """Build the text that gets embedded for one paper dict.

        Concatenates the non-empty ``title``, ``abstract`` and ``keywords``
        fields as ``"Title: ... Abstract: ... Keywords: ..."``; a list of
        keywords is joined with ``", "``. Returns ``""`` when all are empty.
        """
        parts = []
        if paper.get('title'):
            parts.append(f"Title: {paper['title']}")
        if paper.get('abstract'):
            parts.append(f"Abstract: {paper['abstract']}")
        if paper.get('keywords'):
            keywords = paper['keywords']
            kw = ', '.join(keywords) if isinstance(keywords, list) else keywords
            parts.append(f"Keywords: {kw}")
        return ' '.join(parts)

    def _embed_openai(self, texts):
        """Embed a string or list of strings via the OpenAI client.

        Texts are sent in batches of 100 per request. Returns a NumPy array
        with one row per input text.
        """
        if isinstance(texts, str):
            texts = [texts]

        embeddings = []
        batch_size = 100
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            response = self.client.embeddings.create(input=batch, model=self.model_name)
            embeddings.extend(item.embedding for item in response.data)

        return np.array(embeddings)

    def _embed_local(self, texts):
        """Embed a string or list of strings with the local model.

        A progress bar is shown only when more than 100 texts are encoded.
        """
        if isinstance(texts, str):
            texts = [texts]
        return self.model.encode(texts, show_progress_bar=len(texts) > 100)

    def _embed(self, texts):
        """Embed texts with the backend selected at construction time."""
        if self.model_type == "openai":
            return self._embed_openai(texts)
        return self._embed_local(texts)

    def compute_embeddings(self, force=False):
        """Return the paper embeddings, computing and caching them if needed.

        Args:
            force: Recompute even when embeddings are already loaded.

        Returns:
            The embedding matrix, one row per paper.
        """
        if self.embeddings is not None and not force:
            print("Using cached embeddings")
            return self.embeddings

        print(f"Computing embeddings ({self.model_name})...")
        texts = [self._create_text(p) for p in self.papers]
        self.embeddings = self._embed(texts)

        print(f"Computed: {self.embeddings.shape}")
        self._save_cache()
        return self.embeddings

    def search(self, examples=None, query=None, top_k=100):
        """Return the ``top_k`` papers most similar to the examples or query.

        Args:
            examples: List of paper-like dicts (``title``, ``abstract``,
                optionally ``keywords``). Their embeddings are averaged into
                one query vector. Takes precedence over ``query``.
            query: Free-text query, used when ``examples`` is empty.
            top_k: Maximum number of results to return.

        Returns:
            A list of ``{'paper': <paper dict>, 'similarity': <float>}``
            sorted by descending cosine similarity.

        Raises:
            ValueError: If neither ``examples`` nor ``query`` is given, or if
                an example has no title, abstract or keywords.
        """
        # Validate before the potentially expensive embedding computation.
        if not examples and not query:
            raise ValueError("Provide either examples or query")

        if self.embeddings is None:
            self.compute_embeddings()

        if examples:
            # Same text layout as the indexed papers, so example and corpus
            # embeddings are built from comparably formatted text.
            texts = [self._create_text(ex) for ex in examples]
            if not all(texts):
                raise ValueError("Each example needs a title, abstract or keywords")
            query_emb = np.mean(self._embed(texts), axis=0).reshape(1, -1)
        else:
            query_emb = self._embed(query).reshape(1, -1)

        similarities = cosine_similarity(query_emb, self.embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [{
            'paper': self.papers[idx],
            'similarity': float(similarities[idx])
        } for idx in top_indices]

    def display(self, results, n=10):
        """Print the first ``n`` search results in a human-readable form."""
        print(f"\n{'='*80}")
        print(f"Top {len(results)} Results (showing {min(n, len(results))})")
        print(f"{'='*80}\n")

        for i, result in enumerate(results[:n], 1):
            paper = result['paper']
            sim = result['similarity']
            # .get keeps the listing going when a paper lacks a field.
            print(f"{i}. [{sim:.4f}] {paper.get('title', 'N/A')}")
            print(f" #{paper.get('number', 'N/A')} | {paper.get('primary_area', 'N/A')}")
            print(f" {paper.get('forum_url', 'N/A')}\n")

    def save(self, results, output_file):
        """Write the results, model name and result count to a JSON file."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                'model': self.model_name,
                'total': len(results),
                'results': results
            }, f, ensure_ascii=False, indent=2)
        print(f"Saved to {output_file}")