新增了zeo++的部分

This commit is contained in:
2025-10-23 17:08:57 +08:00
parent c0b2ec5983
commit 1f8667ae51
4 changed files with 370 additions and 2 deletions

View File

@@ -10,8 +10,8 @@ searcher.compute_embeddings()
examples = [
{
"title": "Improving Developer Emotion Classification via LLM-Based Augmentation",
"abstract": "Detecting developer emotion in the informative data stream of technical commit messages..."
"title": "Solid-State battery",
"abstract": "Solid-State battery"
},
]

View File

@@ -11,10 +11,15 @@ from starlette.routing import Mount
from system_tools import create_system_mcp
from materialproject_mcp import create_materials_mcp
from softBV_remake import create_softbv_mcp
from paper_search_mcp import create_paper_search_mcp
from topological_analysis_models import create_topological_analysis_mcp
# 创建 MCP 实例
system_mcp = create_system_mcp()
materials_mcp = create_materials_mcp()
softbv_mcp = create_softbv_mcp()
paper_search_mcp = create_paper_search_mcp()
topological_analysis_mcp = create_topological_analysis_mcp()
# 在 Starlette 的 lifespan 中启动 MCP 的 session manager
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
@@ -22,6 +27,8 @@ async def lifespan(app: Starlette):
await stack.enter_async_context(system_mcp.session_manager.run())
await stack.enter_async_context(materials_mcp.session_manager.run())
await stack.enter_async_context(softbv_mcp.session_manager.run())
await stack.enter_async_context(paper_search_mcp.session_manager.run())
await stack.enter_async_context(topological_analysis_mcp.session_manager.run())
yield # 服务器运行期间
# 退出时自动清理
@@ -32,6 +39,8 @@ app = Starlette(
Mount("/system", app=system_mcp.streamable_http_app()),
Mount("/materials", app=materials_mcp.streamable_http_app()),
Mount("/softBV", app=softbv_mcp.streamable_http_app()),
Mount("/papersearch",app=paper_search_mcp.streamable_http_app()),
Mount("/topologicalAnalysis",app=topological_analysis_mcp.streamable_http_app()),
],
)
@@ -41,4 +50,6 @@ app = Starlette(
# http://localhost:8000/system
# http://localhost:8000/materials
# http://localhost:8000/softBV
# http://localhost:8000/papersearch
# http://localhost:8000/topologicalAnalysis
# 如果需要浏览器客户端访问CORS 暴露 Mcp-Session-Id请参考 README 中的 CORS 配置示例 [1]

172
mcp/paper_search_mcp.py Normal file
View File

@@ -0,0 +1,172 @@
# paper_search_mcp.py (重构版)
import os
import asyncio
from dataclasses import dataclass
from typing import Any
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from pydantic import BaseModel, Field
# 假设您的 search.py 提供了 PaperSearcher 类
from SearchPaperByEmbedding.search import PaperSearcher
from mcp.server.fastmcp import FastMCP, Context
from mcp.server.session import ServerSession
# ========= 1. 配置与常量 (修正版) =========
# 获取当前脚本文件所在的目录的绝对路径
_CURRENT_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# 基于项目根目录构造资源文件的绝对路径
DATA_DIR = os.path.join(_CURRENT_SCRIPT_DIR,"SearchPaperByEmbedding")
PAPERS_JSON_PATH = os.path.join(DATA_DIR, "iclr2026_papers.json")
# 打印路径用于调试,确保它是正确的
print(f"DEBUG: Trying to load papers from: {PAPERS_JSON_PATH}")
MODEL_TYPE = os.getenv("EMBEDDING_MODEL_TYPE", "local")
# ========= 2. 数据模型 =========
class Paper(BaseModel):
"""单篇论文的详细信息"""
title: str
authors: list[str]
abstract: str
pdf_url: str | None = None
similarity_score: float = Field(description="与查询的相似度分数,越高越相关")
class SearchQuery(BaseModel):
"""论文搜索的查询结构"""
title: str
abstract: str | None = Field(None, description="可选的摘要,以提供更丰富的上下文")
# ========= 3. Lifespan (资源加载与管理) =========
@dataclass
class PaperSearchContext:
"""在服务器生命周期内共享的、已初始化的 PaperSearcher 实例"""
searcher: PaperSearcher
@asynccontextmanager
async def paper_search_lifespan(_server: FastMCP) -> AsyncIterator[PaperSearchContext]:
"""
FastMCP 生命周期:在服务器启动时初始化 PaperSearcher 并计算 Embeddings。
这确保了最耗时的部分只在启动时执行一次。
"""
print(f"正在使用 '{MODEL_TYPE}' 模型初始化论文搜索引擎...")
# 使用 to_thread 运行同步的初始化代码
def _initialize_searcher():
# 1. 初始化
searcher = PaperSearcher(
PAPERS_JSON_PATH,
model_type=MODEL_TYPE
)
# 2. 计算/加载 Embeddings
print("正在计算或加载论文 Embeddings...")
searcher.compute_embeddings()
print("Embeddings 加载完成。")
return searcher
searcher = await asyncio.to_thread(_initialize_searcher)
print("论文搜索引擎已准备就绪!")
try:
# 使用 yield 将创建好的共享资源上下文传递给 MCP
yield PaperSearchContext(searcher=searcher)
finally:
print("论文搜索引擎服务关闭。")
# ========= 4. MCP 服务器实例 =========
mcp = FastMCP(
name="Paper Search",
instructions="通过 embedding 相似度检索 ICLR 2026 论文的工具。",
lifespan=paper_search_lifespan,
streamable_http_path="/",
)
# ========= 5. 工具实现 =========
@mcp.tool()
async def search_papers(
queries: list[SearchQuery],
top_k: int = 3,
ctx: Context[ServerSession, PaperSearchContext] | None = None,
) -> list[Paper]:
"""
根据给定的一个或多个查询(包含标题和可选的摘要),使用语义相似度搜索论文。
返回最相关的 top_k 个结果。
"""
if ctx is None:
raise ValueError("Context is required for this operation.")
if not queries:
return []
app_ctx = ctx.request_context.lifespan_context
searcher = app_ctx.searcher
# 预处理查询,确保 title 和 abstract 至少有一个存在
examples_to_search = []
for q in queries:
query_dict = q.model_dump(exclude_none=True)
if "title" not in query_dict and "abstract" in query_dict:
query_dict["title"] = query_dict["abstract"]
elif "abstract" not in query_dict and "title" in query_dict:
query_dict["abstract"] = query_dict["title"]
examples_to_search.append(query_dict)
if not examples_to_search:
return []
query_display = queries[0].title or queries[0].abstract or "Empty Query"
await ctx.info(f"正在以查询 '{query_display}' 等内容搜索排名前 {top_k} 的论文...")
# 调用底层的 search 方法
results = await asyncio.to_thread(
searcher.search,
examples=examples_to_search,
top_k=top_k
)
# --- 关键修正:按照 search.py 返回的正确结构进行解析 ---
papers = []
for res in results:
paper_data = res.get('paper', {}) # 获取嵌套的论文信息字典
similarity = res.get('similarity', 0.0) # 获取相似度分数
papers.append(
Paper(
title=paper_data.get("title", "N/A"),
authors=paper_data.get("authors", []),
abstract=paper_data.get("abstract", ""),
# 注意:原始数据中 pdf url 的键是 'forum_url'
pdf_url=paper_data.get("forum_url"),
similarity_score=similarity # 使用正确的相似度分数
)
)
await ctx.debug(f"找到 {len(papers)} 篇相关论文。")
return papers
# ========= 6. 工厂函数 (用于 Starlette 集成) =========
def create_paper_search_mcp() -> FastMCP:
"""供 Starlette 挂载的工厂函数。"""
return mcp
if __name__ == "__main__":
# 如果直接运行此文件,则启动一个独立的 MCP 服务器
mcp.run(transport="streamable-http")

View File

@@ -0,0 +1,185 @@
# topological_analysis_tools.py
import posixpath
import re
from typing import Any, Literal
from contextlib import asynccontextmanager
from collections.abc import AsyncIterator
import asyncssh
from pydantic import BaseModel, Field
# 从 MCP SDK 导入核心组件 [1]
from mcp.server.fastmcp import FastMCP, Context
from mcp.server.session import ServerSession
# 从 lifespan.py 导入共享的配置和数据类 [3]
# 注意: 我们不再需要导入 system_lifespan
from lifespan import (
SharedAppContext,
REMOTE_HOST,
REMOTE_USER,
PRIVATE_KEY_PATH,
INITIAL_WORKING_DIRECTORY,
)
# ==============================================================================
# 1. 辅助函数 (为实现独立性而复制)
# ==============================================================================
def shell_quote(arg: str) -> str:
"""安全地将字符串转义为单个 shell 参数 [5]。"""
return "'" + arg.replace("'", "'\"'\"'") + "'"
def _safe_join(sandbox_root: str, relative_path: str) -> str:
"""安全地将用户提供的相对路径连接到沙箱根目录 [5]。"""
rel = (relative_path or ".").strip()
if rel.startswith("/"):
rel = rel.lstrip("/")
combined = posixpath.normpath(posixpath.join(sandbox_root, rel))
root_norm = sandbox_root.rstrip("/")
if combined != root_norm and not combined.startswith(root_norm + "/"):
raise ValueError("路径越界: 仅允许访问沙箱目录内部")
if ".." in combined.split("/"):
raise ValueError("非法路径: 不允许使用 '..' 跨目录")
return combined
# ==============================================================================
# 2. 独立的生命周期管理器
# ==============================================================================
@asynccontextmanager
async def analysis_lifespan(_server: FastMCP) -> AsyncIterator[SharedAppContext]:
"""
为拓扑分析工具独立管理 SSH 连接的生命周期。
"""
conn: asyncssh.SSHClientConnection | None = None
print("分析工具: 正在建立独立的 SSH 连接...")
try:
conn = await asyncssh.connect(
REMOTE_HOST,
username=REMOTE_USER,
client_keys=[PRIVATE_KEY_PATH],
)
print(f"分析工具: 独立的 SSH 连接到 {REMOTE_HOST} 成功!")
yield SharedAppContext(ssh_connection=conn, sandbox_path=INITIAL_WORKING_DIRECTORY)
finally:
if conn:
conn.close()
await conn.wait_closed()
print("分析工具: 独立的 SSH 连接已关闭。")
# ==============================================================================
# 3. 数据模型与解析函数 (与之前相同)
# ==============================================================================
# (为简洁起见,此处省略了 Pydantic 模型和 _parse_analysis_output 函数的定义,
# 请将上一版本中的这部分代码粘贴到这里)
VALID_CONFIGS = Literal["O.yaml", "S.yaml", "Cl.yaml", "Br.yaml"]
class SubmitTopologicalAnalysisParams(BaseModel):
cif_file_path: str = Field(description="远程服务器上 CIF 文件的相对路径。")
config_files: list[VALID_CONFIGS] = Field(description="选择一个或多个用于分析的 YAML 配置文件。")
output_file_path: str = Field(description="用于保存计算结果的输出文件相对路径。")
class AnalysisResult(BaseModel):
percolation_diameter_a: float | None = Field(None, description="渗透直径 (Percolation diameter), 单位 Å。")
connectivity_distance: float | None = Field(None, description="连通距离 (the minium of d), 单位 Å。")
max_node_length_a: float | None = Field(None, description="最大节点长度 (Maximum node length), 单位 Å。")
min_cation_distance_a: float | None = Field(None, description="到阳离子的最小距离, 单位 Å。")
total_time_s: float | None = Field(None, description="总计算耗时, 单位秒。")
long_node_count: int | None = Field(None, description="长节点数量 (Long node number)。")
short_node_count: int | None = Field(None, description="短节点数量 (Short node number)。")
warnings: list[str] = Field(default_factory=list, description="解析过程中遇到的警告或缺失的关键值信息。")
def _parse_analysis_output(log_content: str) -> AnalysisResult:
def find_float(pattern: str) -> float | None:
match = re.search(pattern, log_content)
return float(match.group(1)) if match else None
def find_int(pattern: str) -> int | None:
match = re.search(pattern, log_content)
return int(match.group(1)) if match else None
data = { "percolation_diameter_a": find_float(r"Percolation diameter \(A\):\s*([\d\.]+)"), "max_node_length_a": find_float(r"Maximum node length detected:\s*([\d\.]+)\s*A"), "min_cation_distance_a": find_float(r"minimum distance to cations is\s*([\d\.]+)\s*A"), "total_time_s": find_float(r"Total used time:\s*([\d\.]+)"), "long_node_count": find_int(r"Long node number:\s*(\d+)"), "short_node_count": find_int(r"Short node number:\s*(\d+)"), }
conn_dist_match = re.search(r"the minium of d\s*([\d\.]+)", log_content, re.MULTILINE)
data["connectivity_distance"] = float(conn_dist_match.group(1)) if conn_dist_match else None
warnings = [k + " 未找到" for k, v in data.items() if v is None and k not in ["min_cation_distance_a", "long_node_count", "short_node_count"]]
return AnalysisResult(**data, warnings=warnings)
# ==============================================================================
# 4. MCP 服务器工厂函数
# ==============================================================================
def create_topological_analysis_mcp() -> FastMCP:
"""
供 Starlette 挂载的工厂函数。
这个函数现在创建并使用自己独立的 lifespan 管理器。
"""
analysis_mcp = FastMCP(
name="Topological Analysis Tools",
instructions="用于在远程服务器上提交和分析拓扑计算任务的工具集。",
streamable_http_path="/",
lifespan=analysis_lifespan, # 关键: 使用本文件内定义的独立 lifespan
)
@analysis_mcp.tool()
async def submit_topological_analysis(
params: SubmitTopologicalAnalysisParams,
ctx: Context[ServerSession, SharedAppContext],
) -> dict[str, Any]:
"""【步骤1/2】异步提交拓扑分析任务。"""
# ... (工具的实现逻辑与之前完全相同)
try:
app_ctx = ctx.request_context.lifespan_context
conn = app_ctx.ssh_connection
sandbox_root = app_ctx.sandbox_path
home_dir = f"/cluster/home/{REMOTE_USER}"
tool_dir = f"{home_dir}/tool/Topological_analyse"
cif_abs_path = _safe_join(sandbox_root, params.cif_file_path)
output_abs_path = _safe_join(sandbox_root, params.output_file_path)
config_args = " ".join([shell_quote(f"{tool_dir}/{cfg}") for cfg in params.config_files])
config_flags = f"-i {config_args}"
command = f"""
nohup sh -c '
source {shell_quote(f"{home_dir}/.bashrc")} && \\
conda activate {shell_quote(f"{home_dir}/anaconda3/envs/zeo")} && \\
python {shell_quote(f"{tool_dir}/analyze_voronoi_nodes.py")} \\
{shell_quote(cif_abs_path)} \\
{config_flags}
' > {shell_quote(output_abs_path)} 2>&1 &
""".strip()
await ctx.info("提交后台拓扑分析任务...")
await conn.run(command, check=True)
return {"success": True, "message": "拓扑分析任务已成功提交。", "output_file": params.output_file_path}
except Exception as e:
msg = f"提交后台任务失败: {e}"
await ctx.error(msg)
return {"success": False, "error": msg}
@analysis_mcp.tool()
async def analyze_topological_results(
output_file_path: str,
ctx: Context[ServerSession, SharedAppContext],
) -> AnalysisResult:
"""【步骤2/2】读取并分析拓扑分析任务的输出文件。"""
# ... (工具的实现逻辑与之前完全相同)
try:
await ctx.info(f"开始分析结果文件: {output_file_path}")
app_ctx = ctx.request_context.lifespan_context
conn = app_ctx.ssh_connection
sandbox_root = app_ctx.sandbox_path
target_path = _safe_join(sandbox_root, output_file_path)
async with conn.start_sftp_client() as sftp:
async with sftp.open(target_path, "r") as f:
log_content = await f.read()
if not log_content:
raise ValueError("结果文件为空。")
analysis_data = _parse_analysis_output(log_content)
return analysis_data
except FileNotFoundError:
msg = f"分析失败: 结果文件 '{output_file_path}' 不存在。"
await ctx.error(msg)
return AnalysisResult(warnings=[msg])
except Exception as e:
msg = f"分析结果时发生错误: {e}"
await ctx.error(msg)
return AnalysisResult(warnings=[msg])
return analysis_mcp