NEP framework refactor

2025-12-09 01:15:38 +08:00
parent 19a6924a41
commit 91bdb0dab1
30 changed files with 7930 additions and 1001 deletions

src/__init__.py (new file, +0)

src/interface.py (new file, +0)

src/machine.py (new file, +129)

@@ -0,0 +1,129 @@
# src/machine.py
import os
import subprocess
import logging
class MachineManager:
def __init__(self, machine_config_path):
from src.utils import load_yaml
self.config = load_yaml(machine_config_path)
self.root_dir = self.config.get('root_dir', os.getcwd())
self.script_dir = os.path.join(self.root_dir, self.config.get('script_dir', 'config/scripts'))
logging.info(f"MachineManager initialized. Script dir: {self.script_dir}")
def execute(self, executor_name, work_dir):
"""
统一执行入口
:param executor_name: machine.yaml 中定义的 key (如 gpumd, vasp_cpu)
:param work_dir: 任务执行的工作目录
"""
if executor_name not in self.config['executors']:
logging.error(f"Executor '{executor_name}' not defined in machine.yaml")
return False
exec_conf = self.config['executors'][executor_name]
exec_type = exec_conf.get('type', 'local')
        # Ensure the working directory exists
os.makedirs(work_dir, exist_ok=True)
logging.info(f"--- Task: {executor_name} | Type: {exec_type} ---")
logging.info(f"Working Dir: {work_dir}")
if exec_type == 'local':
return self._run_local(exec_conf, work_dir)
elif exec_type == 'slurm':
return self._submit_slurm(exec_conf, work_dir, executor_name)
else:
logging.error(f"Unknown execution type: {exec_type}")
return False
def _run_local(self, conf, work_dir):
"""本地直接执行"""
# 1. 优先看有没有 script 脚本文件
if 'script' in conf:
script_name = conf['script']
src_script = os.path.join(self.script_dir, script_name)
if not os.path.exists(src_script):
logging.error(f"Script not found: {src_script}")
return False
            # Run the script: bash /path/to/script.sh
cmd = f"bash {src_script}"
        # 2. Otherwise fall back to a raw 'cmd' command
elif 'cmd' in conf:
cmd = conf['cmd']
else:
logging.error("No 'script' or 'cmd' defined for local executor.")
return False
try:
            # Execute inside the working directory
logging.info(f"Executing Local Command: {cmd}")
subprocess.check_call(cmd, shell=True, cwd=work_dir)
logging.info("Local execution success.")
return True
except subprocess.CalledProcessError as e:
logging.error(f"Execution failed with error code {e.returncode}")
return False
def _submit_slurm(self, conf, work_dir, job_name):
"""生成 Slurm 脚本并提交 (模拟)"""
        script_name = conf.get('script')
        if not script_name:
            logging.error("No 'script' defined for slurm executor.")
            return False
        src_script = os.path.join(self.script_dir, script_name)
if not os.path.exists(src_script):
logging.error(f"Script not found: {src_script}")
return False
        # 1. Read the user-defined script content
with open(src_script, 'r') as f:
user_script_content = f.read()
        # 2. Generate the submission script (.sub)
sub_file = os.path.join(work_dir, "submit.sub")
with open(sub_file, 'w') as f:
f.write("#!/bin/bash\n")
f.write(f"#SBATCH --job-name={job_name}\n")
            # Fill in SBATCH parameters from the YAML config
if 'partition' in conf: f.write(f"#SBATCH --partition={conf['partition']}\n")
if 'nodes' in conf: f.write(f"#SBATCH --nodes={conf['nodes']}\n")
if 'ntasks' in conf: f.write(f"#SBATCH --ntasks={conf['ntasks']}\n")
if 'time' in conf: f.write(f"#SBATCH --time={conf['time']}\n")
if 'gpus' in conf: f.write(f"#SBATCH --gres=gpu:{conf['gpus']}\n")
f.write("\n")
f.write("cd $SLURM_SUBMIT_DIR\n")
f.write("\n")
f.write("# --- User Script Content ---\n")
f.write(user_script_content)
logging.info(f"Generated submission script: {sub_file}")
        # 3. Submit the job
        # Note: when testing outside a real Slurm environment we do not actually
        # submit; we only generate the file. To really submit, set TEST_MODE below to False.
TEST_MODE = True
if TEST_MODE:
logging.info("[TEST_MODE] Simulated 'sbatch submit.sub'. Check the .sub file.")
return True
else:
try:
                # Submit and capture the job ID
res = subprocess.check_output(f"sbatch {sub_file}", shell=True, cwd=work_dir)
                job_id = res.decode().strip().split()[-1]  # output is normally "Submitted batch job 123456"
logging.info(f"Job submitted. ID: {job_id}")
                # TODO: add wait_for_job(job_id) polling here; planned for the next stage
return True
except subprocess.CalledProcessError as e:
logging.error(f"Submission failed: {e}")
return False
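For reference, a minimal machine.yaml this class would accept might look like the sketch below. The executor names (gpumd, vasp_cpu, nep_local) match the names the steps request; the script file names and resource values are placeholder assumptions, and only keys actually read by MachineManager are shown.

```yaml
# config/machine.yaml -- hypothetical sketch; script names and resource
# values are placeholders.
root_dir: /path/to/project    # optional, defaults to os.getcwd()
script_dir: config/scripts    # joined under root_dir

executors:
  gpumd:                      # type 'local' with a script file
    type: local
    script: run_gpumd.sh
  nep_local:                  # type 'local' with a raw command
    type: local
    cmd: nep
  vasp_cpu:                   # type 'slurm'; TEST_MODE only writes submit.sub
    type: slurm
    script: run_vasp.sh
    partition: cpu
    nodes: 1
    ntasks: 32
    time: "24:00:00"
```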

src/stages.py (new file, +0)

src/steps.py (new file, +177)

@@ -0,0 +1,177 @@
# src/steps.py
import os
import shutil
import logging
import subprocess
class BaseStep:
def __init__(self, name, work_dir, machine_manager, config):
self.name = name
self.work_dir = work_dir
self.machine = machine_manager
self.config = config
os.makedirs(self.work_dir, exist_ok=True)
self.logger = logging.getLogger()
def copy_file(self, src, dst_name=None):
"""辅助函数:安全复制文件"""
if not os.path.exists(src):
self.logger.error(f"[{self.name}] Source file missing: {src}")
return False
dst_name = dst_name if dst_name else os.path.basename(src)
dst_path = os.path.join(self.work_dir, dst_name)
shutil.copy(src, dst_path)
return dst_path
class MDStep(BaseStep):
"""
对应 00.md: 负责预热/采样
"""
def run(self, prev_nep_path, template_path):
self.logger.info(f"=== Running Step: {self.name} (MD) ===")
        # 1. Prepare nep.txt (from the previous iteration or the initial data)
if not prev_nep_path:
self.logger.error("No nep.txt provided for MD.")
return False
self.copy_file(prev_nep_path, "nep.txt")
        # 2. Prepare model.xyz. For simplicity we assume the parent workflow
        #    (e.g. an Init step) has already placed model.xyz in work_dir,
        #    or that it was passed down from the previous round.
if not os.path.exists(os.path.join(self.work_dir, "model.xyz")):
self.logger.warning(f"[{self.name}] model.xyz not found in {self.work_dir}. Make sure Init step ran.")
        # 3. Prepare run.in (copied from the template)
run_in_src = os.path.join(template_path, "run.in")
self.copy_file(run_in_src, "run.in")
        # 4. Run GPUMD via the machine layer, using the 'gpumd' executor
        #    defined in machine.yaml
success = self.machine.execute("gpumd", self.work_dir)
if success and os.path.exists(os.path.join(self.work_dir, "dump.xyz")):
self.logger.info(f"[{self.name}] MD finished. dump.xyz generated.")
return True
else:
self.logger.error(f"[{self.name}] MD failed or dump.xyz missing.")
return False
class SelectStep(BaseStep):
"""
对应 01.select: 智能筛选
"""
    def run(self, dump_path, train_path, nep_path, method="distance", params=(0.01, 60, 120)):
self.logger.info(f"=== Running Step: {self.name} (Smart Selection) ===")
        # Prepare input files
self.copy_file(dump_path, "dump.xyz")
self.copy_file(train_path, "train.xyz")
self.copy_file(nep_path, "nep.txt")
        target_min, target_max = params[1], params[2]
        threshold = params[0]
        step_size = 0.001  # threshold adjustment applied per attempt
        # The original workflow used gpumdkit.sh for selection (option 203),
        # driven by echo'd stdin, so command construction must be done carefully.
        # We assume gpumdkit.sh is on PATH, or resolvable via the machine config.
        # For local debugging we still rely on gpumdkit.sh here, but since this
        # is Python now, the selection logic (distance computation) should
        # eventually be ported to native Python code.
        # For the moment the call is only simulated:
        for i in range(10):  # at most 10 attempts
self.logger.info(f"Selection attempt {i + 1}: Threshold={threshold:.4f}")
            # Build the stdin string: 203 -> file names -> 1 (distance) -> threshold
            # Note: this assumes gpumdkit.sh accepts exactly this input sequence.
            # For easier debugging we only log here instead of actually invoking
            # gpumdkit (it needs real data files). In a real run this would be:
            # input_str = f"203\ndump.xyz train.xyz nep.txt\n1\n{threshold}\n"
            # subprocess.run("gpumdkit.sh", input=input_str, cwd=self.work_dir, ...)
            # --- Mock code start ---
            # Pretend a selected.xyz was generated; the smaller the threshold,
            # the more structures get selected.
            with open(os.path.join(self.work_dir, "selected.xyz"), 'w') as f:
                mock_count = int(100 / (threshold * 100))
                f.write(f"Mock selected {mock_count} frames")
            selected_count = mock_count
            self.logger.info(f"Found {selected_count} structures (Mock).")
            # --- Mock code end ---
if target_min <= selected_count <= target_max:
self.logger.info(f"Selection Success! Final count: {selected_count}")
return True
elif selected_count < target_min:
self.logger.info("Too few. Decreasing threshold.")
threshold -= step_size
                if threshold <= 0: threshold = 0.001
else:
self.logger.info("Too many. Increasing threshold.")
threshold += step_size
self.logger.warning("Selection failed to converge. Using last result.")
        return True  # allow the workflow to continue for now
class SCFStep(BaseStep):
"""
对应 02.scf: VASP 计算
"""
def run(self, template_path, potcar_path):
self.logger.info(f"=== Running Step: {self.name} (SCF/VASP) ===")
        # 1. Copy POTCAR
self.copy_file(potcar_path, "POTCAR")
        # 2. Copy INCAR
incar_src = os.path.join(template_path, "INCAR")
if not self.copy_file(incar_src, "INCAR"):
            return False  # INCAR is mandatory
        # 3. Copy KPOINTS (optional)
kpoints_src = os.path.join(template_path, "KPOINTS")
if os.path.exists(kpoints_src):
self.copy_file(kpoints_src, "KPOINTS")
        # 4. Run VASP.
        # Note: selected.xyz normally needs to be split into one directory per
        # structure. For this simple local test we assume selected.xyz has
        # already been split into POSCARs, or we only run a single-point test.
        # While developing the framework we call the 'vasp_cpu' executor
        # defined in machine.yaml.
success = self.machine.execute("vasp_cpu", self.work_dir)
return success
class TrainStep(BaseStep):
"""
对应 03.train: NEP 训练
"""
def run(self, template_path, new_train_data_path):
self.logger.info(f"=== Running Step: {self.name} (Train) ===")
        # 1. Prepare nep.in
self.copy_file(os.path.join(template_path, "nep.in"), "nep.in")
        # 2. Prepare train.xyz (assumes all accumulated data has been concatenated into this file)
if new_train_data_path and os.path.exists(new_train_data_path):
self.copy_file(new_train_data_path, "train.xyz")
else:
            # No new data (test only): create a placeholder
with open(os.path.join(self.work_dir, "train.xyz"), 'w') as f:
f.write("Mock training data")
        # 3. Run NEP training
return self.machine.execute("nep_local", self.work_dir)
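The steps above read their inputs from a template tree. A layout consistent with the os.path.join(template_path, ...) calls would be the following; the preheat/production names are placeholders, since the actual sub-directory names come from 'template_sub' in param.yaml (see the sketch at the end of workflow.py).

```
template/
├── 00.md/
│   ├── preheat/run.in
│   └── production/run.in
├── 02.scf/
│   ├── INCAR        # mandatory
│   └── KPOINTS      # optional
└── 03.train/
    └── nep.in
```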

src/utils.py (new file, +47)

@@ -0,0 +1,47 @@
# src/utils.py
import yaml
import logging
import os
import sys
def load_yaml(path):
"""加载 YAML 配置文件"""
if not os.path.exists(path):
logging.error(f"Config file not found: {path}")
sys.exit(1)
with open(path, 'r') as f:
return yaml.safe_load(f)
def setup_logger(work_dir, log_file="autonep.log"):
"""配置日志:同时输出到文件和控制台"""
logger = logging.getLogger()
logger.setLevel(logging.INFO)
    # Clear any existing handlers to avoid duplicate output
if logger.hasHandlers():
logger.handlers.clear()
    # File handler
file_handler = logging.FileHandler(os.path.join(work_dir, log_file))
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
    # Console handler
console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(message)s'))  # console shows the bare message only
logger.addHandler(console_handler)
return logger
class Notifier:
"""(预留) 通知模块"""
def __init__(self, url=None):
self.url = url
def send(self, title, msg, priority=5):
        # For now just log; no actual delivery
logging.info(f"[[Notification]] {title}: {msg}")

src/workflow.py (new file, +131)

@@ -0,0 +1,131 @@
# src/workflow.py
import os
import shutil
import logging
from src.utils import load_yaml
from src.machine import MachineManager
from src.steps import MDStep, SelectStep, SCFStep, TrainStep
class Workflow:
def __init__(self, root_dir):
self.root_dir = root_dir
        # 1. Load configuration
self.param = load_yaml(os.path.join(root_dir, "config/param.yaml"))
        # 2. Initialize the machine manager
self.machine = MachineManager(os.path.join(root_dir, "config/machine.yaml"))
        # 3. Initialize path variables
self.workspace = os.path.join(root_dir, "workspace")
self.data_dir = os.path.join(root_dir, "data")
self.template_dir = os.path.join(root_dir, "template")
self.logger = logging.getLogger()
        # State-tracking variables
self.current_nep_pot = os.path.join(self.data_dir, self.param['files']['initial_pot'])
        # Assume the pre-iteration training set is empty or user-provided; point to a base file for now
self.current_train_set = os.path.join(self.workspace, "accumulated_train.xyz")
def run(self):
self.logger.info(f"Workflow Started: {self.param['project']}")
        # Loop over the configured iterations
for iteration in self.param['iterations']:
iter_id = iteration['id']
iter_name = f"iter_{iter_id:02d}"
iter_path = os.path.join(self.workspace, iter_name)
self.logger.info(f"\n >>> Starting Iteration: {iter_id} <<<")
os.makedirs(iter_path, exist_ok=True)
            # --- Run each Step defined for this iteration ---
for step_conf in iteration['steps']:
step_name = step_conf['name']
# ==========================
# Step: 00.md
# ==========================
if step_name == "00.md":
step_dir = os.path.join(iter_path, "00.md")
                    # Only the first iteration (with an init requirement) needs the
                    # POSCAR -> model.xyz conversion. For this local test we simply
                    # copy POSCAR over as model.xyz (demonstration only); in reality
                    # gpumdkit should perform the conversion.
if iter_id == 0:
os.makedirs(step_dir, exist_ok=True)
shutil.copy(os.path.join(self.data_dir, self.param['files']['poscar']),
os.path.join(step_dir, "model.xyz"))
                    # Loop over sub-tasks (preheat, production, ...)
for sub in step_conf.get('sub_tasks', []):
template_sub_name = sub['template_sub']
sub_work_dir = os.path.join(step_dir, template_sub_name)
template_path = os.path.join(self.template_dir, "00.md", template_sub_name)
                        # Instantiate and run
md_task = MDStep(f"MD-{template_sub_name}", sub_work_dir, self.machine, self.config)
                        # Key: copy in the model.xyz prepared at the parent level
if iter_id == 0:
shutil.copy(os.path.join(step_dir, "model.xyz"), os.path.join(sub_work_dir, "model.xyz"))
                        # Later iterations should use structures selected in the previous round; omitted for now so the first round runs end to end
md_task.run(self.current_nep_pot, template_path)
                        # Record the path of the last generated dump.xyz for the next step
self.last_dump_path = os.path.join(sub_work_dir, "dump.xyz")
# ==========================
# Step: 01.select
# ==========================
elif step_name == "01.select":
step_dir = os.path.join(iter_path, "01.select")
select_task = SelectStep("Select", step_dir, self.machine, self.config)
                    # Use the dump from the previous step plus the current training set / potential
select_task.run(
dump_path=getattr(self, 'last_dump_path', None),
train_path=self.current_train_set,
nep_path=self.current_nep_pot,
method=step_conf.get('method'),
params=step_conf.get('params')
)
# ==========================
# Step: 02.scf
# ==========================
elif step_name == "02.scf":
step_dir = os.path.join(iter_path, "02.scf")
scf_task = SCFStep("SCF", step_dir, self.machine, self.config)
template_path = os.path.join(self.template_dir, "02.scf")
potcar_path = os.path.join(self.data_dir, self.param['files']['potcar'])
scf_task.run(template_path, potcar_path)
                    # Pretend some new data was produced
self.new_data_chunk = os.path.join(step_dir, "scf_results.xyz")
# ==========================
# Step: 03.train
# ==========================
elif step_name == "03.train":
step_dir = os.path.join(iter_path, "03.train")
train_task = TrainStep("Train", step_dir, self.machine, self.config)
template_path = os.path.join(self.template_dir, "03.train")
                    # The real logic should merge self.new_data_chunk into
                    # total_train.xyz; for now we pass it through directly
train_task.run(template_path, getattr(self, 'new_data_chunk', None))
                    # Update the current potential path for the next iteration
self.current_nep_pot = os.path.join(step_dir, "nep.txt")
self.logger.info("Workflow Finished Successfully.")
@property
def config(self):
        return self.param  # simple pass-through
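For completeness, a param.yaml shape consistent with the keys read above (project, files.initial_pot / poscar / potcar, iterations[].id / steps) might be the following sketch; all values are placeholders.

```yaml
# config/param.yaml -- hypothetical sketch; values are placeholders.
project: AutoNEP-demo

files:
  initial_pot: nep.txt    # under data/
  poscar: POSCAR          # under data/
  potcar: POTCAR          # under data/

iterations:
  - id: 0
    steps:
      - name: 00.md
        sub_tasks:
          - template_sub: preheat       # -> template/00.md/preheat/
          - template_sub: production    # -> template/00.md/production/
      - name: 01.select
        method: distance
        params: [0.01, 60, 120]         # [threshold, min_frames, max_frames]
      - name: 02.scf
      - name: 03.train
```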