sofvBV_mcp refactor v2
Embedding copy
21
mcp/SearchPaperByEmbedding/LICENSE
Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 gyj155

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
96
mcp/SearchPaperByEmbedding/README.md
Normal file
@@ -0,0 +1,96 @@
# Paper Semantic Search

Find similar papers using semantic search. Supports both local models (free) and the OpenAI API (higher quality).

## Features

- Fetch papers from OpenReview (e.g., ICLR 2026 submissions)
- Semantic search with example papers or free-text queries
- Automatic embedding caching
- Embedding model support: open-source (e.g., all-MiniLM-L6-v2) or OpenAI
## Quick Start

```bash
pip install -r requirements.txt
```

### 1. Prepare Papers

```python
from crawl import crawl_papers

crawl_papers(
    venue_id="ICLR.cc/2026/Conference/Submission",
    output_file="iclr2026_papers.json"
)
```
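Each record in the resulting JSON file has the shape below (field names taken from `crawl.py`; the values here are illustrative only):

```json
{
  "id": "abc123",
  "number": 12345,
  "title": "Example Paper Title",
  "authors": ["Author One", "Author Two"],
  "abstract": "Example abstract...",
  "keywords": ["keyword1", "keyword2"],
  "primary_area": "foundation or frontier models, including LLMs",
  "forum_url": "https://openreview.net/forum?id=abc123"
}
```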
### 2. Search Papers

```python
from search import PaperSearcher

# Local model (free)
searcher = PaperSearcher('iclr2026_papers.json', model_type='local')

# OpenAI model (better quality, requires API key)
# export OPENAI_API_KEY='your-key'
# searcher = PaperSearcher('iclr2026_papers.json', model_type='openai')

searcher.compute_embeddings()

# Search with example papers you are interested in
examples = [
    {
        "title": "Your paper title",
        "abstract": "Your paper abstract..."
    }
]

results = searcher.search(examples=examples, top_k=100)

# Or search with a text query
results = searcher.search(query="interesting topics", top_k=100)

searcher.display(results, n=10)
searcher.save(results, 'results.json')
```
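`search` returns a plain list of `{'paper': ..., 'similarity': ...}` dicts sorted by score (see `search.py` below), so results are easy to post-process. For example, filtering by a score threshold (the 0.5 cutoff is an arbitrary example value):

```python
# Keep only strong matches and print a short summary of each
strong = [r for r in results if r['similarity'] >= 0.5]
for r in strong[:5]:
    print(f"{r['similarity']:.3f}  {r['paper']['title']}")
```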
## How It Works

1. Paper titles and abstracts are converted to embeddings
2. Embeddings are cached automatically
3. Your query is embedded using the same model
4. Cosine similarity finds the most similar papers
5. Results are ranked by similarity score (see the sketch below)
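A minimal sketch of steps 3-5 with random stand-in vectors (the 384 dimension matches all-MiniLM-L6-v2; the real implementation is in `search.py`):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

paper_embs = np.random.rand(1000, 384)  # stand-in for cached paper embeddings
query_emb = np.random.rand(1, 384)      # stand-in for the embedded query

sims = cosine_similarity(query_emb, paper_embs)[0]  # one score per paper
top_idx = np.argsort(sims)[::-1][:10]               # highest scores first
print([(int(i), round(float(sims[i]), 4)) for i in top_idx])
```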
## Cache

Embeddings are cached as `cache_<filename>_<hash>_<model>.npy`. Delete the cache file to recompute.
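The name is derived like this (mirrors `_get_cache_file` in `search.py` below; note the hash is of the papers file *path*, not its contents):

```python
import hashlib
from pathlib import Path

papers_file = "iclr2026_papers.json"
file_hash = hashlib.md5(papers_file.encode()).hexdigest()[:8]  # hash of the path string
print(f"cache_{Path(papers_file).stem}_{file_hash}_local.npy")
```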
## Example Output

```
================================================================================
Top 100 Results (showing 10)
================================================================================

1. [0.8456] Paper a
   #12345 | foundation or frontier models, including LLMs
   https://openreview.net/forum?id=xxx

2. [0.8234] Paper b
   #12346 | applications to robotics, autonomy, planning
   https://openreview.net/forum?id=yyy
```
## Tips

- Use 1-5 example papers for best results, or a paragraph describing the topic you are interested in
- The local model is good enough for most cases
- Use the OpenAI model for critical searches (~$1 for 18k queries)

If you find this useful, please consider giving it a star~
66
mcp/SearchPaperByEmbedding/crawl.py
Normal file
@@ -0,0 +1,66 @@
import requests
import json
import time

def fetch_submissions(venue_id, offset=0, limit=1000):
    # Fetch one page of submissions from the OpenReview API v2.
    url = "https://api2.openreview.net/notes"
    params = {
        "content.venueid": venue_id,
        "details": "replyCount,invitation",
        "limit": limit,
        "offset": offset,
        "sort": "number:desc"
    }
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    return response.json()

def crawl_papers(venue_id, output_file):
    all_papers = []
    offset = 0
    limit = 1000

    print(f"Fetching papers from {venue_id}...")

    while True:
        data = fetch_submissions(venue_id, offset, limit)
        notes = data.get("notes", [])

        if not notes:
            break

        for note in notes:
            # Flatten the nested OpenReview note into a plain dict.
            paper = {
                "id": note.get("id"),
                "number": note.get("number"),
                "title": note.get("content", {}).get("title", {}).get("value", ""),
                "authors": note.get("content", {}).get("authors", {}).get("value", []),
                "abstract": note.get("content", {}).get("abstract", {}).get("value", ""),
                "keywords": note.get("content", {}).get("keywords", {}).get("value", []),
                "primary_area": note.get("content", {}).get("primary_area", {}).get("value", ""),
                "forum_url": f"https://openreview.net/forum?id={note.get('id')}"
            }
            all_papers.append(paper)

        print(f"Fetched {len(notes)} papers (total: {len(all_papers)})")

        if len(notes) < limit:
            break

        offset += limit
        time.sleep(0.5)  # be polite to the API between pages

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_papers, f, ensure_ascii=False, indent=2)

    print(f"\nTotal: {len(all_papers)} papers")
    print(f"Saved to {output_file}")
    return all_papers

if __name__ == "__main__":
    crawl_papers(
        venue_id="ICLR.cc/2026/Conference/Submission",
        output_file="iclr2026_papers.json"
    )
22
mcp/SearchPaperByEmbedding/demo.py
Normal file
@@ -0,0 +1,22 @@
from search import PaperSearcher

# Use local model (free)
searcher = PaperSearcher('iclr2026_papers.json', model_type='local')

# Or use OpenAI (better quality)
# searcher = PaperSearcher('iclr2026_papers.json', model_type='openai')

searcher.compute_embeddings()

examples = [
    {
        "title": "Improving Developer Emotion Classification via LLM-Based Augmentation",
        "abstract": "Detecting developer emotion in the informative data stream of technical commit messages..."
    },
]

results = searcher.search(examples=examples, top_k=100)

searcher.display(results, n=10)
searcher.save(results, 'results.json')
6
mcp/SearchPaperByEmbedding/requirements.txt
Normal file
@@ -0,0 +1,6 @@
requests
numpy
scikit-learn
sentence-transformers
openai
156
mcp/SearchPaperByEmbedding/search.py
Normal file
@@ -0,0 +1,156 @@
import json
import numpy as np
import os
import hashlib
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

class PaperSearcher:
    def __init__(self, papers_file, model_type="openai", api_key=None, base_url=None):
        with open(papers_file, 'r', encoding='utf-8') as f:
            self.papers = json.load(f)

        self.model_type = model_type
        self.cache_file = self._get_cache_file(papers_file, model_type)
        self.embeddings = None

        if model_type == "openai":
            from openai import OpenAI
            self.client = OpenAI(
                api_key=api_key or os.getenv('OPENAI_API_KEY'),
                base_url=base_url
            )
            self.model_name = "text-embedding-3-large"
        else:
            from sentence_transformers import SentenceTransformer
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.model_name = "all-MiniLM-L6-v2"

        self._load_cache()

    def _get_cache_file(self, papers_file, model_type):
        # Cache name: cache_<filename>_<hash>_<model>.npy, stored next to the papers file.
        base_name = Path(papers_file).stem
        file_hash = hashlib.md5(papers_file.encode()).hexdigest()[:8]
        cache_name = f"cache_{base_name}_{file_hash}_{model_type}.npy"
        return str(Path(papers_file).parent / cache_name)

    def _load_cache(self):
        if os.path.exists(self.cache_file):
            try:
                self.embeddings = np.load(self.cache_file)
                if len(self.embeddings) == len(self.papers):
                    print(f"Loaded cache: {self.embeddings.shape}")
                    return True
                # Stale cache: paper count changed since it was written.
                self.embeddings = None
            except Exception:
                self.embeddings = None
        return False

    def _save_cache(self):
        np.save(self.cache_file, self.embeddings)
        print(f"Saved cache: {self.cache_file}")

    def _create_text(self, paper):
        # Concatenate title, abstract, and keywords into a single string to embed.
        parts = []
        if paper.get('title'):
            parts.append(f"Title: {paper['title']}")
        if paper.get('abstract'):
            parts.append(f"Abstract: {paper['abstract']}")
        if paper.get('keywords'):
            kw = ', '.join(paper['keywords']) if isinstance(paper['keywords'], list) else paper['keywords']
            parts.append(f"Keywords: {kw}")
        return ' '.join(parts)

    def _embed_openai(self, texts):
        if isinstance(texts, str):
            texts = [texts]

        embeddings = []
        batch_size = 100

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            response = self.client.embeddings.create(input=batch, model=self.model_name)
            embeddings.extend([item.embedding for item in response.data])

        return np.array(embeddings)

    def _embed_local(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        return self.model.encode(texts, show_progress_bar=len(texts) > 100)

    def compute_embeddings(self, force=False):
        if self.embeddings is not None and not force:
            print("Using cached embeddings")
            return self.embeddings

        print(f"Computing embeddings ({self.model_name})...")
        texts = [self._create_text(p) for p in self.papers]

        if self.model_type == "openai":
            self.embeddings = self._embed_openai(texts)
        else:
            self.embeddings = self._embed_local(texts)

        print(f"Computed: {self.embeddings.shape}")
        self._save_cache()
        return self.embeddings

    def search(self, examples=None, query=None, top_k=100):
        if self.embeddings is None:
            self.compute_embeddings()

        if examples:
            texts = []
            for ex in examples:
                text = f"Title: {ex['title']}"
                if ex.get('abstract'):
                    text += f" Abstract: {ex['abstract']}"
                texts.append(text)

            if self.model_type == "openai":
                embs = self._embed_openai(texts)
            else:
                embs = self._embed_local(texts)

            # Average the example embeddings into a single query vector.
            query_emb = np.mean(embs, axis=0).reshape(1, -1)

        elif query:
            if self.model_type == "openai":
                query_emb = self._embed_openai(query).reshape(1, -1)
            else:
                query_emb = self._embed_local(query).reshape(1, -1)
        else:
            raise ValueError("Provide either examples or query")

        similarities = cosine_similarity(query_emb, self.embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [{
            'paper': self.papers[idx],
            'similarity': float(similarities[idx])
        } for idx in top_indices]

    def display(self, results, n=10):
        print(f"\n{'='*80}")
        print(f"Top {len(results)} Results (showing {min(n, len(results))})")
        print(f"{'='*80}\n")

        for i, result in enumerate(results[:n], 1):
            paper = result['paper']
            sim = result['similarity']

            print(f"{i}. [{sim:.4f}] {paper['title']}")
            print(f"   #{paper.get('number', 'N/A')} | {paper.get('primary_area', 'N/A')}")
            print(f"   {paper['forum_url']}\n")

    def save(self, results, output_file):
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                'model': self.model_name,
                'total': len(results),
                'results': results
            }, f, ensure_ascii=False, indent=2)
        print(f"Saved to {output_file}")