sofvBV_mcp refactor v2
Embedding copy
21
mcp/SearchPaperByEmbedding/LICENSE
Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 gyj155

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
96
mcp/SearchPaperByEmbedding/README.md
Normal file
@@ -0,0 +1,96 @@
# Paper Semantic Search

Find similar papers using semantic search. Supports both local models (free) and the OpenAI API (higher quality).

## Features

- Fetch papers from OpenReview (e.g., ICLR 2026 submissions)
- Semantic search with example papers or free-text queries
- Automatic embedding caching
- Embedding model support: open-source (e.g., all-MiniLM-L6-v2) or OpenAI
## Quick Start

```bash
pip install -r requirements.txt
```

### 1. Prepare Papers

```python
from crawl import crawl_papers

crawl_papers(
    venue_id="ICLR.cc/2026/Conference/Submission",
    output_file="iclr2026_papers.json"
)
```
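Each record in the resulting JSON file has the shape below (field names taken from `crawl.py`; the values here are illustrative only):

```json
{
  "id": "abc123",
  "number": 12345,
  "title": "Example Paper Title",
  "authors": ["Author One", "Author Two"],
  "abstract": "Example abstract...",
  "keywords": ["keyword1", "keyword2"],
  "primary_area": "foundation or frontier models, including LLMs",
  "forum_url": "https://openreview.net/forum?id=abc123"
}
```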
### 2. Search Papers

```python
from search import PaperSearcher

# Local model (free)
searcher = PaperSearcher('iclr2026_papers.json', model_type='local')

# OpenAI model (better quality, requires API key)
# export OPENAI_API_KEY='your-key'
# searcher = PaperSearcher('iclr2026_papers.json', model_type='openai')

searcher.compute_embeddings()

# Search with example papers you are interested in
examples = [
    {
        "title": "Your paper title",
        "abstract": "Your paper abstract..."
    }
]

results = searcher.search(examples=examples, top_k=100)

# Or search with a text query
results = searcher.search(query="interesting topics", top_k=100)

searcher.display(results, n=10)
searcher.save(results, 'results.json')
```
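`search` returns a plain list of `{'paper': ..., 'similarity': ...}` dicts sorted by score (see `search.py` below), so results are easy to post-process. For example, filtering by a score threshold (the 0.5 cutoff is an arbitrary example value):

```python
# Keep only strong matches and print a short summary of each
strong = [r for r in results if r['similarity'] >= 0.5]
for r in strong[:5]:
    print(f"{r['similarity']:.3f}  {r['paper']['title']}")
```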
## How It Works

1. Paper titles and abstracts are converted to embeddings
2. Embeddings are cached automatically
3. Your query is embedded using the same model
4. Cosine similarity finds the most similar papers
5. Results are ranked by similarity score (see the sketch below)
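A minimal sketch of steps 3-5 with random stand-in vectors (the 384 dimension matches all-MiniLM-L6-v2; the real implementation is in `search.py`):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

paper_embs = np.random.rand(1000, 384)  # stand-in for cached paper embeddings
query_emb = np.random.rand(1, 384)      # stand-in for the embedded query

sims = cosine_similarity(query_emb, paper_embs)[0]  # one score per paper
top_idx = np.argsort(sims)[::-1][:10]               # highest scores first
print([(int(i), round(float(sims[i]), 4)) for i in top_idx])
```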
## Cache

Embeddings are cached as `cache_<filename>_<hash>_<model>.npy`. Delete the cache file to recompute.
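The name is derived like this (mirrors `_get_cache_file` in `search.py` below; note the hash is of the papers file *path*, not its contents):

```python
import hashlib
from pathlib import Path

papers_file = "iclr2026_papers.json"
file_hash = hashlib.md5(papers_file.encode()).hexdigest()[:8]  # hash of the path string
print(f"cache_{Path(papers_file).stem}_{file_hash}_local.npy")
```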
## Example Output

```
================================================================================
Top 100 Results (showing 10)
================================================================================

1. [0.8456] Paper a
   #12345 | foundation or frontier models, including LLMs
   https://openreview.net/forum?id=xxx

2. [0.8234] Paper b
   #12346 | applications to robotics, autonomy, planning
   https://openreview.net/forum?id=yyy
```
## Tips

- Use 1-5 example papers for best results, or a paragraph describing the topic you are interested in
- The local model is good enough for most cases
- Use the OpenAI model for critical searches (~$1 for 18k queries)

If you find this useful, please consider giving it a star~
66
mcp/SearchPaperByEmbedding/crawl.py
Normal file
@@ -0,0 +1,66 @@
import requests
import json
import time

def fetch_submissions(venue_id, offset=0, limit=1000):
    # Fetch one page of submissions from the OpenReview API v2.
    url = "https://api2.openreview.net/notes"
    params = {
        "content.venueid": venue_id,
        "details": "replyCount,invitation",
        "limit": limit,
        "offset": offset,
        "sort": "number:desc"
    }
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    return response.json()

def crawl_papers(venue_id, output_file):
    all_papers = []
    offset = 0
    limit = 1000

    print(f"Fetching papers from {venue_id}...")

    while True:
        data = fetch_submissions(venue_id, offset, limit)
        notes = data.get("notes", [])

        if not notes:
            break

        for note in notes:
            # Flatten the nested OpenReview note into a plain dict.
            paper = {
                "id": note.get("id"),
                "number": note.get("number"),
                "title": note.get("content", {}).get("title", {}).get("value", ""),
                "authors": note.get("content", {}).get("authors", {}).get("value", []),
                "abstract": note.get("content", {}).get("abstract", {}).get("value", ""),
                "keywords": note.get("content", {}).get("keywords", {}).get("value", []),
                "primary_area": note.get("content", {}).get("primary_area", {}).get("value", ""),
                "forum_url": f"https://openreview.net/forum?id={note.get('id')}"
            }
            all_papers.append(paper)

        print(f"Fetched {len(notes)} papers (total: {len(all_papers)})")

        if len(notes) < limit:
            break

        offset += limit
        time.sleep(0.5)  # be polite to the API between pages

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_papers, f, ensure_ascii=False, indent=2)

    print(f"\nTotal: {len(all_papers)} papers")
    print(f"Saved to {output_file}")
    return all_papers

if __name__ == "__main__":
    crawl_papers(
        venue_id="ICLR.cc/2026/Conference/Submission",
        output_file="iclr2026_papers.json"
    )
22
mcp/SearchPaperByEmbedding/demo.py
Normal file
@@ -0,0 +1,22 @@
from search import PaperSearcher

# Use local model (free)
searcher = PaperSearcher('iclr2026_papers.json', model_type='local')

# Or use OpenAI (better quality)
# searcher = PaperSearcher('iclr2026_papers.json', model_type='openai')

searcher.compute_embeddings()

examples = [
    {
        "title": "Improving Developer Emotion Classification via LLM-Based Augmentation",
        "abstract": "Detecting developer emotion in the informative data stream of technical commit messages..."
    },
]

results = searcher.search(examples=examples, top_k=100)

searcher.display(results, n=10)
searcher.save(results, 'results.json')
6
mcp/SearchPaperByEmbedding/requirements.txt
Normal file
@@ -0,0 +1,6 @@
requests
numpy
scikit-learn
sentence-transformers
openai
156
mcp/SearchPaperByEmbedding/search.py
Normal file
@@ -0,0 +1,156 @@
import json
import numpy as np
import os
import hashlib
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

class PaperSearcher:
    def __init__(self, papers_file, model_type="openai", api_key=None, base_url=None):
        with open(papers_file, 'r', encoding='utf-8') as f:
            self.papers = json.load(f)

        self.model_type = model_type
        self.cache_file = self._get_cache_file(papers_file, model_type)
        self.embeddings = None

        if model_type == "openai":
            from openai import OpenAI
            self.client = OpenAI(
                api_key=api_key or os.getenv('OPENAI_API_KEY'),
                base_url=base_url
            )
            self.model_name = "text-embedding-3-large"
        else:
            from sentence_transformers import SentenceTransformer
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.model_name = "all-MiniLM-L6-v2"

        self._load_cache()

    def _get_cache_file(self, papers_file, model_type):
        # Cache name: cache_<filename>_<hash>_<model>.npy, stored next to the papers file.
        base_name = Path(papers_file).stem
        file_hash = hashlib.md5(papers_file.encode()).hexdigest()[:8]
        cache_name = f"cache_{base_name}_{file_hash}_{model_type}.npy"
        return str(Path(papers_file).parent / cache_name)

    def _load_cache(self):
        if os.path.exists(self.cache_file):
            try:
                self.embeddings = np.load(self.cache_file)
                if len(self.embeddings) == len(self.papers):
                    print(f"Loaded cache: {self.embeddings.shape}")
                    return True
                # Stale cache: paper count changed since it was written.
                self.embeddings = None
            except Exception:
                self.embeddings = None
        return False

    def _save_cache(self):
        np.save(self.cache_file, self.embeddings)
        print(f"Saved cache: {self.cache_file}")

    def _create_text(self, paper):
        # Concatenate title, abstract, and keywords into a single string to embed.
        parts = []
        if paper.get('title'):
            parts.append(f"Title: {paper['title']}")
        if paper.get('abstract'):
            parts.append(f"Abstract: {paper['abstract']}")
        if paper.get('keywords'):
            kw = ', '.join(paper['keywords']) if isinstance(paper['keywords'], list) else paper['keywords']
            parts.append(f"Keywords: {kw}")
        return ' '.join(parts)

    def _embed_openai(self, texts):
        if isinstance(texts, str):
            texts = [texts]

        embeddings = []
        batch_size = 100

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            response = self.client.embeddings.create(input=batch, model=self.model_name)
            embeddings.extend([item.embedding for item in response.data])

        return np.array(embeddings)

    def _embed_local(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        return self.model.encode(texts, show_progress_bar=len(texts) > 100)

    def compute_embeddings(self, force=False):
        if self.embeddings is not None and not force:
            print("Using cached embeddings")
            return self.embeddings

        print(f"Computing embeddings ({self.model_name})...")
        texts = [self._create_text(p) for p in self.papers]

        if self.model_type == "openai":
            self.embeddings = self._embed_openai(texts)
        else:
            self.embeddings = self._embed_local(texts)

        print(f"Computed: {self.embeddings.shape}")
        self._save_cache()
        return self.embeddings

    def search(self, examples=None, query=None, top_k=100):
        if self.embeddings is None:
            self.compute_embeddings()

        if examples:
            texts = []
            for ex in examples:
                text = f"Title: {ex['title']}"
                if ex.get('abstract'):
                    text += f" Abstract: {ex['abstract']}"
                texts.append(text)

            if self.model_type == "openai":
                embs = self._embed_openai(texts)
            else:
                embs = self._embed_local(texts)

            # Average the example embeddings into a single query vector.
            query_emb = np.mean(embs, axis=0).reshape(1, -1)

        elif query:
            if self.model_type == "openai":
                query_emb = self._embed_openai(query).reshape(1, -1)
            else:
                query_emb = self._embed_local(query).reshape(1, -1)
        else:
            raise ValueError("Provide either examples or query")

        similarities = cosine_similarity(query_emb, self.embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [{
            'paper': self.papers[idx],
            'similarity': float(similarities[idx])
        } for idx in top_indices]

    def display(self, results, n=10):
        print(f"\n{'='*80}")
        print(f"Top {len(results)} Results (showing {min(n, len(results))})")
        print(f"{'='*80}\n")

        for i, result in enumerate(results[:n], 1):
            paper = result['paper']
            sim = result['similarity']

            print(f"{i}. [{sim:.4f}] {paper['title']}")
            print(f"   #{paper.get('number', 'N/A')} | {paper.get('primary_area', 'N/A')}")
            print(f"   {paper['forum_url']}\n")

    def save(self, results, output_file):
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                'model': self.model_name,
                'total': len(results),
                'results': results
            }, f, ensure_ascii=False, indent=2)
        print(f"Saved to {output_file}")