nep框架重构 02.scf

2025-12-09 20:24:52 +08:00
parent ceff569583
commit f19d8ac4f0
2 changed files with 133 additions and 2 deletions
--- a/config/machine.yaml
+++ b/config/machine.yaml
@@ -22,6 +22,6 @@ executors:
    cmd: "gpumd" # 对应 config/scripts/gpumd.sh
  # 3. VASP test executor (named vasp_gpu; currently runs with type "local", not via Slurm submission)
-  vasp_cpu:
+  vasp_gpu:
    type: "local"
    cmd: "mpirun -np 1 vasp_std"
--- a/src/workflow.py
+++ b/src/workflow.py
@@ -310,4 +310,135 @@ class Workflow:
                                return
                    else:
-                        self.logger.info("Skipping Select (Already Done).")
+                        self.logger.info("Skipping Select (Already Done).")
                # ==========================
                # Step: 02.scf (VASP Calculation)
                # ==========================
                elif step_name == "02.scf":
                    step_dir = os.path.join(iter_path, "02.scf")
                    task_id_scf = f"{iter_name}.02.scf"
                    if not self.tracker.is_done(task_id_scf):
                        self.logger.info("=== Step: 02.scf (VASP) ===")
                        os.makedirs(step_dir, exist_ok=True)
                        # 1. 准备 selected.xyz
                        # 尝试从同轮次的 01.select 获取
                        select_step_dir = os.path.join(iter_path, "01.select")
                        src_selected = os.path.join(select_step_dir, "selected.xyz")
                        if not os.path.exists(src_selected):
                            self.logger.error(f"selected.xyz not found in {select_step_dir}")
                            return
                        dst_selected = os.path.join(step_dir, "selected.xyz")
                        if os.path.exists(dst_selected): os.remove(dst_selected)
                        os.symlink(os.path.abspath(src_selected), dst_selected)
                        # 2. 运行 301 拆分结构
                        # 命令: echo -e "301\niter" | gpumdkit.sh
                        # 这会生成 iterX_1, iterX_2... 和 fp 文件夹
                        kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
                        input_str_301 = "301\niter"  # 这里 "iter" 是文件夹前缀名，gpumdkit 会自动加数字
                        self.logger.info("Splitting structures (301)...")
                        if not run_cmd_with_log(kit_path, step_dir, "scf_setup.log", input_str=input_str_301):
                            self.logger.error("301 command failed.")
                            return
                        # 3. 准备 VASP 输入文件到 'fp' 文件夹
                        # gpumdkit 生成的 fp 文件夹通常存放公共文件，子文件夹会软链过去
                        fp_dir = os.path.join(step_dir, "fp")
                        if not os.path.exists(fp_dir):
                            self.logger.error("'fp' directory was not created by 301.")
                            return
                        self.logger.info("Distributing VASP inputs to 'fp' folder...")
                        # A. POTCAR (来自 Data)
                        potcar_src = os.path.join(self.data_dir, self.param['files']['potcar'])
                        if os.path.exists(potcar_src):
                            shutil.copy(potcar_src, os.path.join(fp_dir, "POTCAR"))
                        else:
                            self.logger.error(f"POTCAR missing: {potcar_src}")
                            return
                        # B. INCAR (来自 Template)
                        # Template 路径: template/02.scf/INCAR
                        incar_src = os.path.join(self.template_dir, "02.scf", "INCAR")
                        if os.path.exists(incar_src):
                            shutil.copy(incar_src, os.path.join(fp_dir, "INCAR"))
                        else:
                            self.logger.error(f"INCAR missing in template: {incar_src}")
                            return
                        # C. KPOINTS (来自 Template, 可选)
                        kpoints_src = os.path.join(self.template_dir, "02.scf", "KPOINTS")
                        if os.path.exists(kpoints_src):
                            shutil.copy(kpoints_src, os.path.join(fp_dir, "KPOINTS"))
                        else:
                            self.logger.info("KPOINTS not found in template, assuming KSPACING in INCAR.")
                        # 4. 生成并提交计算任务
                        # 这里我们不理会 gpumdkit 生成的 presub.sh，而是根据 machine.yaml 生成自己的
                        executor_name = step_conf.get('executor', 'vasp_gpu')  # falls back to the 'vasp_gpu' executor when the step config names none
                        # 获取执行命令 (例如 "mpirun -np 32 vasp_std")
                        # 这里的逻辑需要调用 machine 模块的一个新功能：批量生成提交脚本
                        # 但为了简化，我们在 Local 模式下生成一个遍历脚本
                        self.logger.info(f"Generating batch submission script for {executor_name}...")
                        # 读取 machine 配置里的命令
                        exec_conf = self.machine.config['executors'].get(executor_name, {})
                        vasp_cmd = exec_conf.get('cmd', 'mpirun -np 1 vasp_std')  # 默认值
                        # 生成 run_vasp.sh
                        run_script_path = os.path.join(step_dir, "run_vasp.sh")
                        with open(run_script_path, 'w') as f:
                            f.write("#!/bin/bash\n")
                            # 遍历 iter* 目录
                            f.write(f"for dir in iter*_*; do\n")
                            f.write(f"  if [ -d \"$dir\" ]; then\n")
                            f.write(f"    echo \"Running VASP in $dir ...\"\n")
                            f.write(f"    cd $dir\n")
                            # 写入具体的 VASP 执行命令
                            f.write(f"    {vasp_cmd} > vasp.log 2>&1\n")  # 重定向日志
                            f.write(f"    cd ..\n")
                            f.write(f"  fi\n")
                            f.write(f"done\n")
                        os.chmod(run_script_path, 0o755)
                        # 执行 VASP 计算
                        # 注意：如果是在 Slurm 上，这里应该提交 run_vasp.sh，并使用 Job ID 等待
                        # 目前 Local 模式直接运行
                        self.logger.info(">>> Executing VASP batch calculations (this may take time)...")
                        if not run_cmd_with_log("./run_vasp.sh", step_dir, "scf_exec.log"):
                            self.logger.error("VASP batch execution failed.")
                            return
                        # 5. 结果收集 (out2xyz)
                        self.logger.info("Collecting results (out2xyz)...")
                        cmd_collect = f"{kit_path} -out2xyz ."
                        if run_cmd_with_log(cmd_collect, step_dir, "scf_collect.log"):
                            # 检查结果
                            res_dir = os.path.join(step_dir, "NEPdataset-multiple_frames")
                            res_file = os.path.join(res_dir, "NEP-dataset.xyz")
                            if os.path.exists(res_file):
                                self.logger.info(f"VASP data collected: {res_file}")
                                # 保存这个路径供 Train 使用
                                self.new_data_chunk = res_file
                                self.tracker.mark_done(task_id_scf)
                            else:
                                self.logger.error("NEP-dataset.xyz not found after collection.")
                        else:
                            self.logger.error("out2xyz failed.")
                    else:
                        self.logger.info("Skipping SCF (Already Done).")
                        # 即使跳过，也要尝试恢复 self.new_data_chunk 变量，防止 Train 找不到数据
                        # 这里简单推断一下
                        res_file = os.path.join(step_dir, "NEPdataset-multiple_frames", "NEP-dataset.xyz")
                        if os.path.exists(res_file):
                            self.new_data_chunk = res_file