nep框架搭建

2025-12-08 22:05:06 +08:00
parent 5057d18e98
commit cba2afb403
9 changed files with 498 additions and 5 deletions
--- a/.idea/NEP-auto.iml
+++ b/.idea/NEP-auto.iml
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/config/machine.yaml
+++ b/config/machine.yaml
@@ -25,7 +25,12 @@ systems:
        command: "nep"
        env_setup: ""
        gpu_id: 0
-
+      gpumdkit:
        # gpumdkit.sh 是辅助脚本工具（如 -addlabel 结构转换），直接调用即可，不需要 mpirun
        command: "gpumdkit.sh"
        env_setup: ""
        # 即使是 local 模式，有时也需要指定并行度
        n_procs: 1
      # 3. VASP (GPU 版) 配置
      vasp:
        # 假设是 GPU 版本，可能不需要 mpirun 或者只需要少量核
--- a/config/param.yaml
+++ b/config/param.yaml
@@ -32,9 +32,7 @@ notification:
 # --- 3. 各模块具体的物理/算法参数 ---
 params:
  preheat:
-    temp: 300
+    template_file: "run_ramp.in"
    steps: 10000
    # 这里不需要指定 gpumd 路径，只需要指定物理量
  select:
    target_min: 60
--- a/nep_auto/modules/m0_preheat.py
+++ b/nep_auto/modules/m0_preheat.py
@@ -0,0 +1,113 @@
 import shutil
 import logging
 from pathlib import Path
 from .base_module import BaseModule
 class PreheatModule(BaseModule):
    """Preheat stage: run one GPUMD job in ``<iter_dir>/00.md/preheat``.

    The stage stages ``run.in``, ``nep.in``, ``nep.txt`` and ``model.xyz`` in
    its work directory and then invokes the ``gpumd`` runner entry.

    Relies on members supplied by ``BaseModule`` (defined outside this file):
    ``iter_dir``, ``iter_id``, ``iter_name``, ``root``, ``config_param``,
    ``config_sys``, ``logger``, ``runner``, ``initialize`` and
    ``copy_template``.
    """

    def __init__(self, driver, iter_id):
        """Forward to ``BaseModule`` and select the ``00_md`` template folder."""
        super().__init__(driver, iter_id)
        self.template_subdir = "00_md"

    def get_work_dir(self):
        """Return the preheat work directory ``<iter_dir>/00.md/preheat``."""
        return self.iter_dir / "00.md" / "preheat"

    def initialize(self):
        """Stage every input file GPUMD needs in the work directory.

        Raises:
            FileNotFoundError: if the potential or the structure file is missing.
            RuntimeError: if the structure conversion yields no ``model.xyz``.
        """
        # Base-class step; per the original author's note it creates the directory.
        super().initialize()
        # 1. run.in: the template name comes from params.preheat.template_file.
        template_name = self.config_param['params']['preheat'].get('template_file', 'run.in')
        self.copy_template(template_name, "run.in")
        # 2. nep.in (the original author notes GPUMD needs it even if it is trivial).
        self.copy_template("nep.in")
        # 3. nep.txt (potential).
        self._prepare_potential()
        # 4. model.xyz (structure).
        self._prepare_structure()

    def _prepare_potential(self):
        """Copy the NEP potential into the work directory as ``nep.txt``.

        Iteration 1 uses ``system.initial_potential`` from the system config;
        later iterations use ``<root>/iter_<N-1>/03.train/nep.txt``.

        Raises:
            FileNotFoundError: if the selected source potential does not exist.
        """
        dst = self.get_work_dir() / "nep.txt"
        if self.iter_id == 1:
            init_pot = Path(self.config_sys['system']['initial_potential'])
            if not init_pot.exists():
                raise FileNotFoundError(f"Initial potential not found: {init_pot}")
            shutil.copy(init_pot, dst)
            self.logger.info(f"   -> Copied initial potential: {init_pot.name}")
            return

        # Later iterations: reuse the potential trained in the previous iteration.
        prev_iter = f"iter_{self.iter_id - 1:03d}"
        src = self.root / prev_iter / "03.train" / "nep.txt"
        if not src.exists():
            raise FileNotFoundError(f"Previous potential not found: {src}")
        shutil.copy(src, dst)
        self.logger.info(f"   -> Copied potential from {prev_iter}")

    def _prepare_structure(self):
        """Create ``model.xyz`` in the work directory from the initial structure.

        The structure named by ``system.initial_structure`` is copied locally
        and converted with ``gpumdkit.sh -addlabel <file> <elements...>`` via
        the ``gpumdkit`` runner entry. If the tool does not write ``model.xyz``
        but does write ``<stem>.xyz``, that file is renamed to ``model.xyz``.

        Raises:
            FileNotFoundError: if the structure file does not exist.
            RuntimeError: if neither ``model.xyz`` nor ``<stem>.xyz`` appears.
        """
        work_dir = self.get_work_dir()
        # Preheat always starts from the initial structure.
        vasp_path = Path(self.config_sys['system']['initial_structure'])
        if not vasp_path.exists():
            raise FileNotFoundError(f"Structure file not found: {vasp_path}")

        local_vasp = work_dir / vasp_path.name
        shutil.copy(vasp_path, local_vasp)

        elements = " ".join(self.config_sys['system']['elements'])
        self.logger.info("   -> Converting VASP to model.xyz...")
        # Requires a 'gpumdkit' entry in machine.yaml.
        cmd_args = f"-addlabel {local_vasp.name} {elements}"
        self.runner.run("gpumdkit", cwd=work_dir, extra_args=cmd_args)

        model_xyz = work_dir / "model.xyz"
        if model_xyz.exists():
            return
        # Tolerate the tool naming its output after the input, e.g. LiYCl.xyz.
        fallback = work_dir / (local_vasp.stem + ".xyz")
        if not fallback.exists():
            raise RuntimeError("Failed to generate model.xyz from gpumdkit")
        shutil.move(fallback, model_xyz)

    def run(self):
        """Stage inputs and run GPUMD unless a usable result already exists.

        The skip test is ``check_done()``, so an empty ``thermo.out`` left by a
        crashed run no longer causes the stage to be skipped.
        """
        work_dir = self.get_work_dir()
        if self.check_done():
            self.logger.info("   -> Pre-check: thermo.out exists, skipping preheat.")
            return

        self.logger.info(f"🔥 Running Preheat in {self.iter_name}")
        self.initialize()
        # The gpumd runner entry is invoked without extra arguments.
        self.runner.run("gpumd", cwd=work_dir)
        self.logger.info("   -> Preheat finished.")
        if not self.check_done():
            self.logger.warning("   -> thermo.out is missing or empty after preheat.")

    def check_done(self):
        """Return True when ``thermo.out`` exists in the work directory and is non-empty."""
        f = self.get_work_dir() / "thermo.out"
        return f.exists() and f.stat().st_size > 0
--- a/nep_auto/modules/m1_md.py
+++ b/nep_auto/modules/m1_md.py
@@ -0,0 +1,116 @@
 import shutil
 import glob
 from pathlib import Path
 from nep_auto.modules.base_module import BaseModule
 class MDModule(BaseModule):
    """MD sampling stage working in ``<iter_dir>/00.md/md``.

    Steps performed by ``run``:
      1. run ``sample_structures.py`` on a copy of the preheat ``dump.xyz``;
      2. create one ``sample_<k>`` folder per run template and stage its inputs;
      3. run the ``gpumd`` runner entry in every folder;
      4. concatenate the per-folder ``dump.xyz`` files into ``<work_dir>/dump.xyz``.

    Relies on members supplied by ``BaseModule`` (defined outside this file):
    ``iter_dir``, ``iter_id``, ``driver``, ``logger``, ``runner`` and
    ``initialize``.
    """

    # Frame count passed to sample_structures.py when params.md.n_samples is absent.
    DEFAULT_N_SAMPLES = 4

    def __init__(self, driver, iter_id):
        """Forward to ``BaseModule`` and set the input and work directories."""
        super().__init__(driver, iter_id)
        self.template_subdir = "00_md"
        # Preheat directory (input source).
        self.preheat_dir = self.iter_dir / "00.md" / "preheat"
        # MD directory (work area).
        self.work_dir = self.iter_dir / "00.md" / "md"

    def get_work_dir(self):
        """Return the MD work directory."""
        return self.work_dir

    def run(self):
        """Execute the whole sampling stage.

        Raises:
            FileNotFoundError: if the preheat ``dump.xyz`` is missing.
            RuntimeError: if no run template is found, or if no sub-task
                produced a ``dump.xyz``.
        """
        self.logger.info(f"🌪️ [MD] Starting Sampling Phase Iter {self.iter_id}...")
        self.initialize()

        self._sample_preheat_trajectory()
        sub_tasks = self._prepare_sub_tasks()

        self.logger.info(f"   -> Submitting {len(sub_tasks)} MD tasks...")
        for task_dir in sub_tasks:
            self.logger.info(f"      -> Running {task_dir.name}...")
            self.runner.run("gpumd", cwd=task_dir)

        self._merge_dumps(sub_tasks)
        self.check_done()

    def _sample_preheat_trajectory(self):
        """Run ``sample_structures.py <dump> uniform <n>`` on the preheat trajectory.

        ``n`` is read from ``params.md.n_samples`` and defaults to
        ``DEFAULT_N_SAMPLES`` so existing configs behave as before.
        """
        preheat_dump = self.preheat_dir / "dump.xyz"
        if not preheat_dump.exists():
            raise FileNotFoundError(f"Preheat dump not found: {preheat_dump}")

        kit_root = self.driver.config_param['env']['gpumdkit_root']
        script = f"{kit_root}/Scripts/sample_structures/sample_structures.py"

        # Work on a local copy so the script runs inside the MD directory.
        shutil.copy(preheat_dump, self.work_dir / "preheat_dump.xyz")

        md_cfg = self.driver.config_param.get('params', {}).get('md') or {}
        n_samples = md_cfg.get('n_samples', self.DEFAULT_N_SAMPLES)

        self.logger.info("   -> Sampling structures from preheat trajectory...")
        self.runner.run(
            "python_script",
            cwd=self.work_dir,
            extra_args=f"{script} preheat_dump.xyz uniform {n_samples}"
        )

    def _prepare_sub_tasks(self):
        """Create ``sample_<k>`` folders, one per template, and return their paths.

        Templates are ``template/00_md/md_run_*.in`` in sorted order, falling
        back to ``template/00_md/run.in``. The ``template`` path is relative to
        the current working directory. Each folder receives ``run.in``, the
        preheat ``nep.txt`` and, when present, the preheat ``model.xyz``; the
        run template is expected to reference whatever structure it needs.
        """
        tpl_path = Path("template") / self.template_subdir
        run_templates = sorted(tpl_path.glob("md_run_*.in"))
        if not run_templates:
            self.logger.warning(f"⚠️ No 'md_run_*.in' found in {tpl_path}, looking for 'run.in'...")
            run_templates = list(tpl_path.glob("run.in"))
        if not run_templates:
            # Without templates there is nothing to run; fail instead of
            # producing an empty merged trajectory.
            raise RuntimeError(f"MD failed: no run templates found in {tpl_path}")

        nep_source = self.preheat_dir / "nep.txt"  # reuse the preheat potential
        model_source = self.preheat_dir / "model.xyz"

        sub_tasks = []
        for idx, tpl in enumerate(run_templates, start=1):
            task_dir = self.work_dir / f"sample_{idx}"
            task_dir.mkdir(exist_ok=True)
            sub_tasks.append(task_dir)

            shutil.copy(tpl, task_dir / "run.in")
            shutil.copy(nep_source, task_dir / "nep.txt")
            if model_source.exists():
                shutil.copy(model_source, task_dir / "model.xyz")
        return sub_tasks

    def _merge_dumps(self, sub_tasks):
        """Concatenate every existing ``sample_*/dump.xyz`` into ``<work_dir>/dump.xyz``.

        The target is opened only after at least one source is known to exist,
        so a fully failed stage does not leave an empty ``dump.xyz`` behind.
        """
        self.logger.info("   -> Merging dump files...")
        sources = []
        for task_dir in sub_tasks:
            src = task_dir / "dump.xyz"
            if src.exists():
                sources.append(src)
            else:
                self.logger.warning(f"⚠️ {task_dir.name} generated no dump.xyz")

        if not sources:
            raise RuntimeError("MD failed: no sub-task produced a dump.xyz")

        # Byte-wise concatenation, equivalent to `cat sample_*/dump.xyz`.
        with open(self.work_dir / "dump.xyz", 'wb') as outfile:
            for src in sources:
                with open(src, 'rb') as infile:
                    shutil.copyfileobj(infile, outfile)

    def check_done(self):
        """Return True when the merged ``dump.xyz`` exists and is non-empty.

        Raises:
            RuntimeError: if the file is missing or empty.
        """
        merged = self.work_dir / "dump.xyz"
        if merged.exists() and merged.stat().st_size > 0:
            self.logger.info("✅ MD Sampling finished.")
            return True
        raise RuntimeError("MD failed: dump.xyz not created or empty.")
--- a/nep_auto/modules/m2_select.py
+++ b/nep_auto/modules/m2_select.py
@@ -0,0 +1,105 @@
 import shutil
 import re
 from pathlib import Path
 from nep_auto.modules.base_module import BaseModule
 class SelectModule(BaseModule):
    """Active-learning selection stage working in ``<iter_dir>/01.select``.

    Runs ``neptrain_select_structs.py`` repeatedly, adjusting the distance
    threshold until the number of frames in ``selected.xyz`` lies within
    ``[target_min, target_max]`` or the attempt budget is exhausted.

    Relies on members supplied by ``BaseModule`` (defined outside this file):
    ``iter_dir``, ``iter_id``, ``root``, ``driver``, ``config_param``,
    ``logger``, ``runner`` and ``initialize``.
    """

    # Upper bound on selection attempts.
    MAX_ATTEMPTS = 10

    def __init__(self, driver, iter_id):
        """Forward to ``BaseModule`` and set the work and MD directories."""
        super().__init__(driver, iter_id)
        self.work_dir = self.iter_dir / "01.select"
        self.md_dir = self.iter_dir / "00.md" / "md"

    def get_work_dir(self):
        """Return the selection work directory."""
        return self.work_dir

    def get_frame_count(self, xyz_file):
        """Count frames in an extended-XYZ file by counting ``Lattice=`` tokens.

        The file is streamed line by line instead of being loaded whole.
        Returns 0 when the file is missing or cannot be read as text.
        """
        try:
            with open(xyz_file, 'r') as f:
                return sum(line.count("Lattice=") for line in f)
        except (OSError, UnicodeDecodeError):
            return 0

    def _stage_reference_train(self):
        """Place the reference ``train.xyz`` for the selection script in the work dir.

        Sources tried in order: the previous iteration's
        ``03.train/train.xyz`` (iterations > 1), then ``<root>/00.data/train.xyz``.
        If none exists, a ``train.xyz`` already present in the work directory
        is left alone; otherwise an empty file is created.
        NOTE(review): confirm the selection script accepts an empty train.xyz.
        """
        dst = self.work_dir / "train.xyz"
        candidates = []
        if self.iter_id > 1:
            prev_iter = f"iter_{self.iter_id - 1:03d}"
            candidates.append(self.root / prev_iter / "03.train" / "train.xyz")
        candidates.append(self.root / "00.data" / "train.xyz")

        for src in candidates:
            if src.exists():
                shutil.copy(src, dst)
                self.logger.info(f"   -> Reference train.xyz: {src}")
                return
        self.logger.warning("⚠️ No reference train.xyz found; using an empty one.")
        dst.touch()  # touch() does not truncate an existing file

    def _stage_potential(self):
        """Copy ``nep.txt`` into the work directory.

        ``00.md/md/nep.txt`` is preferred; the preheat copy
        (``00.md/preheat/nep.txt``) is the fallback because the MD stage only
        places the potential inside its ``sample_*`` sub-folders.

        Raises:
            FileNotFoundError: if neither location holds ``nep.txt``.
        """
        candidates = (
            self.md_dir / "nep.txt",
            self.iter_dir / "00.md" / "preheat" / "nep.txt",
        )
        for src in candidates:
            if src.exists():
                shutil.copy(src, self.work_dir / "nep.txt")
                return
        raise FileNotFoundError(f"nep.txt not found in: {', '.join(str(c) for c in candidates)}")

    def run(self):
        """Stage inputs and iterate the selection script until the target is met.

        Raises:
            FileNotFoundError: if ``dump.xyz`` or ``nep.txt`` cannot be staged.
            RuntimeError: if no ``selected.xyz`` exists at the end.
        """
        self.logger.info(f"🔍 [Select] Starting Active Learning Selection Iter {self.iter_id}...")
        self.initialize()

        # Required inputs: dump.xyz, train.xyz, nep.txt.
        shutil.copy(self.md_dir / "dump.xyz", self.work_dir / "dump.xyz")
        self._stage_reference_train()
        self._stage_potential()

        cfg = self.config_param['params']['select']
        target_min = cfg.get('target_min', 60)
        target_max = cfg.get('target_max', 120)
        threshold = cfg.get('init_threshold', 0.01)
        step = cfg.get('threshold_step', 0.01)

        kit_root = self.driver.config_param['env']['gpumdkit_root']
        script = f"{kit_root}/Scripts/sample_structures/neptrain_select_structs.py"
        selected_file = self.work_dir / "selected.xyz"

        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            self.logger.info(f"   -> Attempt {attempt}: Threshold = {threshold}")

            # Remove the previous attempt's output so a failed run cannot be
            # mistaken for a fresh result.
            selected_file.unlink(missing_ok=True)

            # The script is assumed to accept --distance and --auto_confirm
            # (the stock tool is interactive) — TODO confirm.
            cmd_args = f"{script} dump.xyz train.xyz nep.txt --distance {threshold} --auto_confirm"
            try:
                self.runner.run("python_script", cwd=self.work_dir, extra_args=cmd_args)
            except Exception as e:
                # Best effort: a failed attempt counts as zero selected frames.
                self.logger.warning(f"Select script warning: {e}")

            count = self.get_frame_count(selected_file)
            self.logger.info(f"      -> Selected {count} structures.")

            if target_min <= count <= target_max:
                self.logger.info("✅ Selection criteria met!")
                break
            if count < target_min:
                lowered = threshold - step
                # Never let the distance threshold reach zero or go negative:
                # halve it once a full step would cross zero.
                threshold = round(lowered if lowered > 0 else threshold / 2, 8)
                self.logger.info(f"      -> Too few, lowering threshold to {threshold}...")
            else:
                threshold = round(threshold + step, 8)
                self.logger.info(f"      -> Too many, raising threshold to {threshold}...")
        else:
            self.logger.warning("⚠️ Max attempts reached in selection. Proceeding with current best.")

        self.check_done()

    def check_done(self):
        """Return True when ``selected.xyz`` exists.

        Raises:
            RuntimeError: if the file is missing.
        """
        if (self.work_dir / "selected.xyz").exists():
            return True
        raise RuntimeError("Selection failed: selected.xyz not found")
--- a/nep_auto/modules/m3_scf.py
+++ b/nep_auto/modules/m3_scf.py
@@ -0,0 +1,91 @@
 import shutil
 from pathlib import Path
 from ase.io import read, write
 from nep_auto.modules.base_module import BaseModule
 class SCFModule(BaseModule):
    """DFT labelling stage working in ``<iter_dir>/02.scf``.

    Each frame of ``01.select/selected.xyz`` gets its own ``task.NNN`` folder
    with ``POSCAR``, ``INCAR``, ``KPOINTS`` and ``POTCAR``; the ``vasp`` runner
    entry is run in every folder and readable ``OUTCAR`` files are collected
    into ``NEP-dataset.xyz``.

    Relies on members supplied by ``BaseModule`` (defined outside this file):
    ``iter_dir``, ``iter_id``, ``logger``, ``runner``, ``initialize`` and
    ``copy_template``.
    """

    # Template files distributed to every task folder.
    VASP_INPUTS = ("INCAR", "KPOINTS", "POTCAR")

    def __init__(self, driver, iter_id):
        """Forward to ``BaseModule`` and set the template, work and input dirs."""
        super().__init__(driver, iter_id)
        self.template_subdir = "02_scf"
        self.work_dir = self.iter_dir / "02.scf"
        self.select_dir = self.iter_dir / "01.select"

    def get_work_dir(self):
        """Return the SCF work directory."""
        return self.work_dir

    def run(self):
        """Prepare, run and collect all single-point calculations.

        Raises:
            FileNotFoundError: if ``selected.xyz`` is missing.
            RuntimeError: if ``selected.xyz`` holds no structures or no
                ``OUTCAR`` could be read.
        """
        self.logger.info(f"⚛️ [SCF] Starting DFT Calculation Iter {self.iter_id}...")
        self.initialize()

        # 1. Read selected.xyz.
        selected_xyz = self.select_dir / "selected.xyz"
        if not selected_xyz.exists():
            raise FileNotFoundError("selected.xyz missing")

        self.logger.info("   -> Reading structures using ASE...")
        atoms_list = read(selected_xyz, index=':')
        self.logger.info(f"   -> Found {len(atoms_list)} structures.")
        if not atoms_list:
            raise RuntimeError("SCF failed: selected.xyz contains no structures")

        # 2. Prepare task folders.
        task_dirs = self._prepare_tasks(atoms_list)

        # 3. Run the jobs one after another.
        self.logger.info("   -> Running VASP jobs...")
        success_count = 0
        for task_dir in task_dirs:
            self.logger.info(f"      -> Running {task_dir.name}...")
            try:
                # 'vasp' must be defined in machine.yaml.
                self.runner.run("vasp", cwd=task_dir)
                if (task_dir / "OUTCAR").exists():
                    success_count += 1
            except Exception as e:
                # One failed task must not abort the remaining ones.
                self.logger.error(f"Task {task_dir.name} failed: {e}")
        self.logger.info(f"   -> Finished. Success: {success_count}/{len(task_dirs)}")

        # 4. Collect results (OUTCAR -> NEP-dataset.xyz).
        self._collect_results(task_dirs)
        self.check_done()

    def _prepare_tasks(self, atoms_list):
        """Create one ``task.NNN`` folder per structure and return the folder list.

        The templates are copied into the work directory once (the call is
        loop-invariant) and then distributed to every task folder.
        """
        for name in self.VASP_INPUTS:
            self.copy_template(name, target_name=None)

        task_dirs = []
        for i, atoms in enumerate(atoms_list):
            task_dir = self.work_dir / f"task.{i:03d}"
            task_dir.mkdir(exist_ok=True)
            task_dirs.append(task_dir)

            write(task_dir / "POSCAR", atoms, format='vasp')
            for name in self.VASP_INPUTS:
                shutil.copy(self.work_dir / name, task_dir / name)
        return task_dirs

    def _collect_results(self, task_dirs):
        """Read every task's ``OUTCAR`` and write ``NEP-dataset.xyz``.

        Unreadable or missing ``OUTCAR`` files are skipped with a warning.

        Raises:
            RuntimeError: if no ``OUTCAR`` could be read.
        """
        self.logger.info("   -> Collecting data...")
        valid_atoms = []
        for task_dir in task_dirs:
            try:
                atoms = read(task_dir / "OUTCAR", format='vasp-outcar')
            except Exception as e:
                self.logger.warning(f"      -> Skipping {task_dir.name}: cannot read OUTCAR ({e})")
                continue
            valid_atoms.append(atoms)

        if not valid_atoms:
            raise RuntimeError("No valid OUTCARs found!")
        write(self.work_dir / "NEP-dataset.xyz", valid_atoms, format='extxyz')

    def check_done(self):
        """Return True when ``NEP-dataset.xyz`` exists.

        Raises:
            RuntimeError: if the file is missing.
        """
        if (self.work_dir / "NEP-dataset.xyz").exists():
            return True
        raise RuntimeError("SCF failed: NEP-dataset.xyz not generated")
--- a/nep_auto/modules/m4_train.py
+++ b/nep_auto/modules/m4_train.py
@@ -0,0 +1,57 @@
 import shutil
 from nep_auto.modules.base_module import BaseModule
 class TrainModule(BaseModule):
    """NEP training stage working in ``<iter_dir>/03.train``.

    Builds ``train.xyz`` as previous data followed by this iteration's
    ``02.scf/NEP-dataset.xyz``, copies ``nep.in`` and runs the ``nep`` runner
    entry.

    Relies on members supplied by ``BaseModule`` (defined outside this file):
    ``iter_dir``, ``iter_id``, ``root``, ``logger``, ``runner``,
    ``initialize`` and ``copy_template``.
    """

    def __init__(self, driver, iter_id):
        """Forward to ``BaseModule`` and set the template and work directories."""
        super().__init__(driver, iter_id)
        self.template_subdir = "03_train"
        self.work_dir = self.iter_dir / "03.train"

    def get_work_dir(self):
        """Return the training work directory."""
        return self.work_dir

    def run(self):
        """Assemble the training set and run NEP training.

        Raises:
            FileNotFoundError: if this iteration's ``NEP-dataset.xyz`` is missing.
            RuntimeError: if training leaves no ``nep.txt``.
        """
        self.logger.info(f"🧠 [Train] Starting Training Iter {self.iter_id}...")
        self.initialize()

        # 1. train.xyz = previous data (if any) + this iteration's SCF data.
        # Check the new data BEFORE opening the output so a missing input
        # cannot leave a truncated train.xyz behind.
        new_data = self.iter_dir / "02.scf" / "NEP-dataset.xyz"
        if not new_data.exists():
            raise FileNotFoundError("New training data (NEP-dataset.xyz) missing!")

        if self.iter_id == 1:
            # First iteration: optional seed set; without it only SCF data is used.
            base_data = self.root / "00.data" / "train.xyz"
        else:
            base_data = self.root / f"iter_{self.iter_id - 1:03d}" / "03.train" / "train.xyz"

        sources = []
        if base_data.exists():
            sources.append(base_data)
        else:
            self.logger.warning(f"⚠️ No previous training data at {base_data}; using new data only.")
        sources.append(new_data)

        with open(self.work_dir / "train.xyz", 'wb') as outfile:
            for src in sources:
                with open(src, 'rb') as infile:
                    shutil.copyfileobj(infile, outfile)

        # 2. nep.in
        self.copy_template("nep.in")

        # 3. Train.
        self.logger.info("   -> Running NEP training...")
        self.runner.run("nep", cwd=self.work_dir)
        self.check_done()

    def check_done(self):
        """Return True when ``nep.txt`` exists in the work directory.

        Raises:
            RuntimeError: if the file is missing.
        """
        if (self.work_dir / "nep.txt").exists():
            self.logger.info("✅ Training finished.")
            return True
        raise RuntimeError("Training failed: nep.txt not generated")
--- a/template/00_md/run_rump.in
+++ b/template/00_md/run_rump.in
@@ -0,0 +1,8 @@
 # GPUMD run script: two consecutive npt_mttk segments with rising target temperature.
 # NOTE(review): config/param.yaml sets template_file to "run_ramp.in" but this
 # file is named "run_rump.in" — confirm which name is intended.
 potential ./nep.txt
 # Initial velocities for 100 (presumably kelvin, matching the ramp start — confirm).
 velocity 100
 # Segment 1: target temperature 100 -> 400; no dump keyword is set before this run.
 ensemble npt_mttk temp 100 400 aniso 0 0
 run 100000
 # Segment 2: target temperature 400 -> 1200, with thermo and exyz output.
 # NOTE(review): the later stages read thermo.out and dump.xyz from this run;
 # confirm the dump settings below are meant to apply to segment 2 only.
 ensemble npt_mttk temp 400 1200 aniso 0 0
 dump_thermo 10
 dump_exyz 10000
 run 100000