nep框架搭建

2025-12-08 17:14:27 +08:00
parent 7a6ca92ad4
commit 0b6537a810
21 changed files with 219 additions and 1 deletions
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -1,4 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="VcsDirectoryMappings" defaultProject="true" />
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
 </project>
--- a/README.md
+++ b/README.md
--- a/config/param.yaml
+++ b/config/param.yaml
@@ -0,0 +1,70 @@
+# config/param.yaml
+
+# --- 1. 环境与路径配置 ---
+env:
+  # 可执行文件绝对路径 (vasp_std 为带 mpirun 前缀的完整启动命令)
+  vasp_std: "mpirun -np 1 /cluster/home/koko125/vasp/bin_gpu/vasp_std"
+  gpumd: "/cluster/home/koko125/tool/GPUMD/src/gpumd"
+  nep: "/cluster/home/koko125/tool/GPUMD/src/nep"
+
+  # GPUMDKit 脚本库根目录
+  gpumdkit_root: "/cluster/home/koko125/tool/GPUMDkit"
+
+  # 【修改点】HPC 作业提交配置 (用于填充 submit.slurm 模板)
+  # 这些变量会被自动替换到 .sh 脚本头部 (下方 slurm_config 目前已注释，尚未启用)
+#  slurm_config:
+#    partition: "v100"      # 队列分区名
+#    account: "def-user"    # 账户名 (如果有)
+#    gpu_per_node: 1        # 每节点 GPU 数
+#    ntasks_per_node: 32    # 每节点 CPU 核数
+#    time_limit: "24:00:00" # 墙钟时间限制
+
+# --- 2. 流程控制 ---
+# 阶段代号定义 (对应 modules 下的 Python 文件)
+stages_def:
+  p: "preheat"   # 00.md/preheat
+  m: "md"        # 00.md/md
+  s: "select"    # 01.select
+  d: "scf"       # 02.scf
+  t: "train"     # 03.train
+  pr: "predict"  # 04.predict (新增：用于性质预测)
+  o: "output"    # 05.output  (始终默认执行：整理报告)
+
+# 自定义流程调度
+# 注意：'o' (output) 不需要显式写在这里，代码逻辑会强制每轮最后执行它
+schedule:
+  # 第1轮: 跑完训练，不做预测，看一眼结果
+  1: ["p", "m", "s", "d", "t"]
+
+  # 第2轮: 跑完训练，加入预测步骤 (计算电导/扩散等)
+  2: ["p", "m", "s", "d", "t", "pr"]
+
+# 默认流程 (当 schedule 中没有定义对应轮次时使用)
+default_workflow: ["p", "m", "s", "d", "t", "pr"]
+
+# --- 3. 容错与通知 ---
+control:
+  max_retries: 3           # 任务失败自动重启次数
+  check_interval: 60       # 状态检查间隔 (秒)
+
+notification:
+  enable_log: true
+  log_file: "./logs/sys_runtime.log"
+
+  enable_hook: true
+  hook_script: "python ./hooks/send_alert.py"
+  alert_events: ["fail", "finish"]
+
+# --- 4. 模块参数 ---
+params:
+  preheat:
+    temp: 300
+    steps: 10000
+  select:
+    target_min: 60
+    target_max: 120
+    init_threshold: 0.01
+  predict:
+    # 预测阶段需要的参数，比如计算电导率的温度范围
+    temperatures: [300, 400, 500]
+    script_path: "scripts/calc_conductivity.py" # 具体的计算脚本
--- a/config/system.yaml
+++ b/config/system.yaml
@@ -0,0 +1,16 @@
+# config/system.yaml
+project_name: "LiYCl_Transport_v1"
+
+# 物理体系定义
+system:
+  elements: ["Li", "Y", "Cl"]
+
+  # 初始结构 (VASP格式)
+  initial_structure: "./initial_data/LiYCl.vasp"
+
+  # 初始势函数 (第一轮 preheat 使用)
+  # 如果是第一轮，使用此通用势；后续轮次自动使用上一轮训练结果
+  initial_potential: "./initial_data/nep89.txt"
+
+  # 晶格常数或扩胞设置 (可选，视具体模块逻辑而定)
+  supercell: [1, 1, 1]
--- a/hooks/send_alert.py
+++ b/hooks/send_alert.py
--- a/main.py
+++ b/main.py
@@ -0,0 +1,33 @@
+import sys
+import time
+import traceback
+from nep_auto.driver import NEPDriver
+from nep_auto.utils.logger import setup_logger
+
+
+def main():
+    """Entry point: configure logging, build the driver, and run it.
+
+    Exits with status 0 when interrupted from the keyboard and with
+    status 1 when any other exception escapes the driver.
+    """
+    # 1. Initialise the global logger and print the start-up banner.
+    logger = setup_logger("logs/sys_runtime.log")
+    banner_rule = "========================================"
+    for banner_line in (
+        banner_rule,
+        "🚀 NEP Automation Framework Starting...",
+        banner_rule,
+    ):
+        logger.info(banner_line)
+
+    try:
+        # 2. Construct the driver, then 3. hand control to its main loop.
+        NEPDriver().run()
+    except KeyboardInterrupt:
+        logger.warning("⚠️ 用户手动中断程序 (KeyboardInterrupt)")
+        sys.exit(0)
+    except Exception as exc:
+        # Log the message first, then the full traceback as a second record.
+        for report_line in (
+            f"❌ 程序发生严重崩溃: {exc!s}",
+            traceback.format_exc(),
+        ):
+            logger.error(report_line)
+        # A crash notification could be sent from here.
+        sys.exit(1)
+
+
+# Run main() only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/nep_auto/init.py
+++ b/nep_auto/init.py
--- a/nep_auto/driver.py
+++ b/nep_auto/driver.py
@@ -0,0 +1,37 @@
+import yaml
+import time
+import logging
+from pathlib import Path
+from nep_auto.status_manager import StatusManager
+
+
+class NEPDriver:
+    """Top-level driver: loads the YAML configuration and reports progress.
+
+    ``__init__`` sets ``logger`` (the "NEP_Auto" logger), ``root`` (the
+    current directory), ``config_sys`` / ``config_param`` (parsed YAML
+    content) and ``status`` (a ``StatusManager`` rooted at ``workspace``).
+    """
+
+    def __init__(self):
+        self.logger = logging.getLogger("NEP_Auto")
+        self.root = Path(".")
+
+        # 1. Load configuration.
+        self.config_sys = self._load_yaml("config/system.yaml")
+        self.config_param = self._load_yaml("config/param.yaml")
+        self.logger.info(f"项目名称: {self.config_sys.get('project_name')}")
+
+        # 2. Initialise the status manager.
+        self.status = StatusManager(self.root / "workspace")
+
+    def _load_yaml(self, path):
+        """Parse the YAML file at *path* and return its content.
+
+        The file is decoded as UTF-8 regardless of the process locale, so
+        non-ASCII comments and values load the same way on every machine.
+        An empty document (``safe_load`` returns ``None``) yields ``{}`` so
+        callers can use ``.get`` on the result.
+
+        Raises:
+            FileNotFoundError: if *path* does not exist.
+        """
+        config_path = Path(path)
+        if not config_path.exists():
+            raise FileNotFoundError(f"配置文件缺失: {path}")
+        with config_path.open('r', encoding='utf-8') as f:
+            loaded = yaml.safe_load(f)
+        return {} if loaded is None else loaded
+
+    def run(self):
+        """Main loop (currently a single pass that only logs progress)."""
+        self.logger.info("✅ 驱动器初始化完成，准备进入主循环...")
+
+        # Look up the current iteration from the status manager.
+        current_iter = self.status.get_current_iter()
+        self.logger.info(f"当前进度: iter_{current_iter:03d}")
+
+        # For now, log once and return; used to smoke-test the environment.
+        self.logger.info("测试阶段：环境检查通过。等待模块代码实现...")
+        # TODO: the scheduling loop (while True: ...) will be implemented here.
--- a/nep_auto/modules/init.py
+++ b/nep_auto/modules/init.py
--- a/nep_auto/modules/base_module.py
+++ b/nep_auto/modules/base_module.py
--- a/nep_auto/modules/m0_preheat.py
+++ b/nep_auto/modules/m0_preheat.py
--- a/nep_auto/modules/m1_md.py
+++ b/nep_auto/modules/m1_md.py
--- a/nep_auto/modules/m2_select.py
+++ b/nep_auto/modules/m2_select.py
--- a/nep_auto/modules/m3_scf.py
+++ b/nep_auto/modules/m3_scf.py
--- a/nep_auto/modules/m4_train.py
+++ b/nep_auto/modules/m4_train.py
--- a/nep_auto/modules/m5_predict.py
+++ b/nep_auto/modules/m5_predict.py
--- a/nep_auto/status_manager.py
+++ b/nep_auto/status_manager.py
@@ -0,0 +1,27 @@
+import json
+import os
+from pathlib import Path
+
+
+class StatusManager:
+    """Persist workflow progress in ``<workspace>/status.json``.
+
+    The file holds a JSON object; a fresh one is created with
+    ``{"current_iter": 1, "stages": {}}`` when none exists yet.
+    """
+
+    def __init__(self, workspace_path):
+        """Ensure the workspace directory and an initial status file exist.
+
+        Args:
+            workspace_path: directory (str or path-like) that holds
+                ``status.json``; created with parents when missing.
+        """
+        self.workspace = Path(workspace_path)
+        self.status_file = self.workspace / "status.json"
+
+        # exist_ok avoids the check-then-create race of exists() + mkdir().
+        self.workspace.mkdir(parents=True, exist_ok=True)
+
+        # Create an initial status file when there is none yet.
+        if not self.status_file.exists():
+            self._save_status({"current_iter": 1, "stages": {}})
+
+    def _save_status(self, data):
+        """Write *data* as indented JSON to the status file.
+
+        The content goes to a sibling temporary file first and is then
+        moved over the real file, so an interruption mid-write cannot
+        leave a truncated ``status.json`` behind.
+        """
+        tmp_file = self.status_file.with_name(self.status_file.name + ".tmp")
+        with open(tmp_file, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=4)
+        tmp_file.replace(self.status_file)
+
+    def get_current_iter(self):
+        """Return the stored ``current_iter`` value.
+
+        Returns 1 when the status file is missing or has no
+        ``current_iter`` key.
+        """
+        try:
+            with open(self.status_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except FileNotFoundError:
+            return 1
+        return data.get("current_iter", 1)
--- a/nep_auto/utils/init.py
+++ b/nep_auto/utils/init.py
--- a/nep_auto/utils/logger.py
+++ b/nep_auto/utils/logger.py
@@ -0,0 +1,33 @@
+import logging
+import os
+import sys
+
+
+def setup_logger(log_file="logs/runtime.log"):
+    """Return the shared "NEP_Auto" logger, configuring it on first use.
+
+    On the first call the logger gets a UTF-8 file handler appending to
+    *log_file* and a stdout stream handler, both with the same timestamped
+    format. Later calls return the already-configured logger unchanged.
+
+    Args:
+        log_file: path of the log file; its parent directory is created
+            when the path has one.
+
+    Returns:
+        The ``logging.Logger`` named "NEP_Auto", set to INFO level.
+    """
+    # Make sure the log directory exists. A bare file name has an empty
+    # dirname and os.makedirs("") raises, so skip creation in that case.
+    log_dir = os.path.dirname(log_file)
+    if log_dir:
+        os.makedirs(log_dir, exist_ok=True)
+
+    logger = logging.getLogger("NEP_Auto")
+    logger.setLevel(logging.INFO)
+
+    # Avoid attaching duplicate handlers on repeated calls.
+    if logger.handlers:
+        return logger
+
+    # Shared record format.
+    formatter = logging.Formatter(
+        '[%(asctime)s] [%(levelname)s] %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+
+    # File output.
+    fh = logging.FileHandler(log_file, mode='a', encoding='utf-8')
+    fh.setFormatter(formatter)
+    logger.addHandler(fh)
+
+    # Console output.
+    ch = logging.StreamHandler(sys.stdout)
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+
+    return logger
--- a/nep_auto/utils/notifier.py
+++ b/nep_auto/utils/notifier.py
--- a/nep_auto/utils/runner.py
+++ b/nep_auto/utils/runner.py