nep框架搭建

2025-12-08 22:34:02 +08:00
parent cba2afb403
commit 19a6924a41
3 changed files with 246 additions and 115 deletions
--- a/nep_auto/modules/m2_select.py
+++ b/nep_auto/modules/m2_select.py
@@ -1,5 +1,5 @@
 import shutil
-import re
+import subprocess
 from pathlib import Path
 from nep_auto.modules.base_module import BaseModule

@@ -14,14 +14,16 @@ class SelectModule(BaseModule):
        return self.work_dir

    def get_frame_count(self, xyz_file):
-        """读取 xyz 文件帧数 (简单通过 grep 'Lattice' 计数，或用 ASE)"""
+        """读取 xyz 文件帧数 (通过 grep 'Lattice' 计数)"""
        if not xyz_file.exists():
            return 0
-        # 简单方法：读取文件统计 Lattice 出现的次数 (ExtXYZ 格式)
        try:
-            with open(xyz_file, 'r') as f:
-                content = f.read()
-                return content.count("Lattice=")
+            # 使用 grep -c 更快，避免 python 读取大文件内存溢出
+            result = subprocess.run(
+                f"grep -c 'Lattice' {xyz_file}",
+                shell=True, stdout=subprocess.PIPE, text=True
+            )
+            return int(result.stdout.strip())
        except:
            return 0

@@ -29,73 +31,114 @@ class SelectModule(BaseModule):
        self.logger.info(f"🔍 [Select] Starting Active Learning Selection Iter {self.iter_id}...")
        self.initialize()

-        # 准备数据
+        # ----------------------------------------
+        # 1. 准备必要文件
+        # ----------------------------------------
+        # A. 待筛选数据 (从 MD 结果拿)
        src_dump = self.md_dir / "dump.xyz"
-        train_xyz_prev = self.root / "00.data" / "train.xyz"  # 或者是上一轮的 train
-        # 如果是 iter > 1，train.xyz 应该是累积的。这里简化，先假设有一个参考的 train.xyz
-
-        # 必须文件：dump.xyz, train.xyz, nep.txt
+        if not src_dump.exists():
+            raise FileNotFoundError(f"MD dump missing: {src_dump}")
        shutil.copy(src_dump, self.work_dir / "dump.xyz")

-        # 这里的 train.xyz 是给 neptrain_select_structs.py 用作参考的
-        if self.iter_id == 1:
-            # 第一轮可以用 data 里的初始文件，或者做一个空的
-            pass
-        else:
-            # 复制上一轮的 train.xyz
-            pass
-
-        # 复制 nep.txt
+        # B. 势函数 (从 MD 结果拿)
        shutil.copy(self.md_dir / "nep.txt", self.work_dir / "nep.txt")

-        # 读取参数
+        # C. 历史训练集 (用于对比)
+        # 逻辑：如果是第一轮，我们需要一个初始的 train.xyz (即使是空的或者是 model.xyz)
+        # gpumdkit 需要这个文件存在
+        target_train_xyz = self.work_dir / "train.xyz"
+
+        if self.iter_id == 1:
+            # 尝试从 data 目录拿初始训练集，如果没有，可以用 model.xyz 充数
+            init_train = self.root / "00.data" / "train.xyz"
+            if init_train.exists():
+                shutil.copy(init_train, target_train_xyz)
+            else:
+                # 如果实在没有，把初始结构当做 train.xyz，避免脚本报错
+                self.logger.warning("No initial train.xyz found, using model.xyz as placeholder.")
+                shutil.copy(self.md_dir / "model.xyz", target_train_xyz)
+        else:
+            # 使用上一轮累积的训练集
+            prev_train = self.root / f"iter_{self.iter_id - 1:03d}" / "03.train" / "train.xyz"
+            if prev_train.exists():
+                shutil.copy(prev_train, target_train_xyz)
+            else:
+                raise FileNotFoundError(f"Previous train.xyz missing: {prev_train}")
+
+        # ----------------------------------------
+        # 2. 循环筛选 (调整阈值)
+        # ----------------------------------------
        cfg = self.config_param['params']['select']
        target_min = cfg.get('target_min', 60)
        target_max = cfg.get('target_max', 120)
        threshold = cfg.get('init_threshold', 0.01)

-        kit_root = self.driver.config_param['env']['gpumdkit_root']
-        script = f"{kit_root}/Scripts/sample_structures/neptrain_select_structs.py"
-
-        # 循环筛选
        max_attempts = 10
        attempt = 0

+        # gpumdkit 命令 (假设 machine.yaml 里配好了 tool 叫 'gpumdkit')
+        # 如果是 local 模式，runner.run 实际上是执行 command。
+        # 但这里我们需要特殊的 input pipe，runner 的通用接口可能不够用。
+        # 既然我们明确是 local 环境且用 pipe，直接用 subprocess 最稳。
+        gpumdkit_cmd = self.machine_config['tools']['gpumdkit']['command']  # e.g. "gpumdkit.sh"
+
        while attempt < max_attempts:
-            self.logger.info(f"   -> Attempt {attempt + 1}: Threshold = {threshold}")
+            self.logger.info(f"   -> Attempt {attempt + 1}: Threshold = {threshold:.5f}")

-            # 构造命令: python script dump.xyz train.xyz nep.txt [options]
-            # 注意：如果你的脚本不支持命令行传参阈值，需要修改脚本或用 sed 修改
-            # 假设脚本已经被修改支持 --distance {threshold}，或者我们用一种 hack 方式
-            # 既然原流程是交互式的，这里强烈建议你修改 neptrain_select_structs.py
-            # 让它支持命令行参数：parser.add_argument('--distance', ...)
+            # 构造输入流字符串
+            # 对应你的流程: 203 -> file names -> 1 (distance mode) -> threshold
+            input_str = f"203\ndump.xyz train.xyz nep.txt\n1\n{threshold}\n"

-            cmd_args = f"{script} dump.xyz train.xyz nep.txt --distance {threshold} --auto_confirm"
+            # Shell equivalent of what follows: echo -e "..." | gpumdkit.sh
+            # No echo pipe is built here: subprocess.run(input=...) writes input_str to the child's stdin.

            try:
-                self.runner.run("python_script", cwd=self.work_dir, extra_args=cmd_args)
-            except Exception as e:
-                self.logger.warning(f"Select script warning: {e}")
+                self.logger.debug(f"      Input string: {repr(input_str)}")

-            # 检查结果
+                process = subprocess.run(
+                    gpumdkit_cmd,
+                    input=input_str,
+                    cwd=self.work_dir,
+                    shell=True,
+                    executable="/bin/bash",
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True
+                )
+
+                # 记录输出以便 debug
+                # self.logger.debug(process.stdout)
+
+                if process.returncode != 0:
+                    self.logger.error(f"gpumdkit execution failed: {process.stderr}")
+                    raise RuntimeError("gpumdkit failed")
+
+            except Exception as e:
+                self.logger.error(f"Execution error: {e}")
+                raise
+
+            # 检查 selected.xyz
            selected_file = self.work_dir / "selected.xyz"
            count = self.get_frame_count(selected_file)
            self.logger.info(f"      -> Selected {count} structures.")

            if target_min <= count <= target_max:
-                self.logger.info("✅ Selection criteria met!")
+                self.logger.info(f"✅ Selection success! ({count} frames)")
                break
            elif count < target_min:
-                self.logger.info("      -> Too few, lowering threshold (-0.01)...")
-                threshold = threshold - 0.01
+                self.logger.info("      -> Too few, lowering threshold (x0.8)...")
+                threshold *= 0.8
            else:
-                self.logger.info("      -> Too many, raising threshold (+0.01)...")
-                threshold = threshold + 0.01
+                self.logger.info("      -> Too many, raising threshold (x1.2)...")
+                threshold *= 1.2
+
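+            # No cleanup between attempts: selected.xyz is assumed to be overwritten by the next
+            # gpumdkit run (TODO confirm; a stale file would otherwise be counted again).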

            attempt += 1

        if attempt >= max_attempts:
-            self.logger.warning("⚠️ Max attempts reached in selection. Proceeding with current best.")
+            self.logger.warning("⚠️ Max attempts reached. Proceeding with current best.")

        self.check_done()