nep框架重构 02.scf

2025-12-09 20:24:52 +08:00
parent ceff569583
commit f19d8ac4f0
2 changed files with 133 additions and 2 deletions
--- a/config/machine.yaml
+++ b/config/machine.yaml
@@ -22,6 +22,6 @@ executors:
    cmd: "gpumd" # 对应 config/scripts/gpumd.sh

  # 3. Slurm 提交测试 (VASP CPU)
-  vasp_cpu:
+  vasp_gpu:
    type: "local"
    cmd: "mpirun -np 1 vasp_std"
--- a/src/workflow.py
+++ b/src/workflow.py
@@ -310,4 +310,135 @@ class Workflow:
                                return

                    else:
-                        self.logger.info("Skipping Select (Already Done).")
+                        self.logger.info("Skipping Select (Already Done).")
+                # ==========================
+                # Step: 02.scf (VASP Calculation)
+                # ==========================
+                elif step_name == "02.scf":
+                    step_dir = os.path.join(iter_path, "02.scf")
+                    task_id_scf = f"{iter_name}.02.scf"
+
+                    if not self.tracker.is_done(task_id_scf):
+                        self.logger.info("=== Step: 02.scf (VASP) ===")
+                        os.makedirs(step_dir, exist_ok=True)
+
+                        # 1. 准备 selected.xyz
+                        # 尝试从同轮次的 01.select 获取
+                        select_step_dir = os.path.join(iter_path, "01.select")
+                        src_selected = os.path.join(select_step_dir, "selected.xyz")
+                        if not os.path.exists(src_selected):
+                            self.logger.error(f"selected.xyz not found in {select_step_dir}")
+                            return
+
+                        dst_selected = os.path.join(step_dir, "selected.xyz")
+                        if os.path.exists(dst_selected): os.remove(dst_selected)
+                        os.symlink(os.path.abspath(src_selected), dst_selected)
+
+                        # 2. 运行 301 拆分结构
+                        # 命令: echo -e "301\niter" | gpumdkit.sh
+                        # 这会生成 iterX_1, iterX_2... 和 fp 文件夹
+                        kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
+                        input_str_301 = "301\niter"  # 这里 "iter" 是文件夹前缀名，gpumdkit 会自动加数字
+
+                        self.logger.info("Splitting structures (301)...")
+                        if not run_cmd_with_log(kit_path, step_dir, "scf_setup.log", input_str=input_str_301):
+                            self.logger.error("301 command failed.")
+                            return
+
+                        # 3. 准备 VASP 输入文件到 'fp' 文件夹
+                        # gpumdkit 生成的 fp 文件夹通常存放公共文件，子文件夹会软链过去
+                        fp_dir = os.path.join(step_dir, "fp")
+                        if not os.path.exists(fp_dir):
+                            self.logger.error("'fp' directory was not created by 301.")
+                            return
+
+                        self.logger.info("Distributing VASP inputs to 'fp' folder...")
+
+                        # A. POTCAR (来自 Data)
+                        potcar_src = os.path.join(self.data_dir, self.param['files']['potcar'])
+                        if os.path.exists(potcar_src):
+                            shutil.copy(potcar_src, os.path.join(fp_dir, "POTCAR"))
+                        else:
+                            self.logger.error(f"POTCAR missing: {potcar_src}")
+                            return
+
+                        # B. INCAR (来自 Template)
+                        # Template 路径: template/02.scf/INCAR
+                        incar_src = os.path.join(self.template_dir, "02.scf", "INCAR")
+                        if os.path.exists(incar_src):
+                            shutil.copy(incar_src, os.path.join(fp_dir, "INCAR"))
+                        else:
+                            self.logger.error(f"INCAR missing in template: {incar_src}")
+                            return
+
+                        # C. KPOINTS (来自 Template, 可选)
+                        kpoints_src = os.path.join(self.template_dir, "02.scf", "KPOINTS")
+                        if os.path.exists(kpoints_src):
+                            shutil.copy(kpoints_src, os.path.join(fp_dir, "KPOINTS"))
+                        else:
+                            self.logger.info("KPOINTS not found in template, assuming KSPACING in INCAR.")
+
+                        # 4. 生成并提交计算任务
+                        # 这里我们不理会 gpumdkit 生成的 presub.sh，而是根据 machine.yaml 生成自己的
+                        executor_name = step_conf.get('executor', 'vasp_gpu')  # falls back to the 'vasp_gpu' executor from machine.yaml when the step config names none
+
+                        # 获取执行命令 (例如 "mpirun -np 32 vasp_std")
+                        # 这里的逻辑需要调用 machine 模块的一个新功能：批量生成提交脚本
+                        # 但为了简化，我们在 Local 模式下生成一个遍历脚本
+
+                        self.logger.info(f"Generating batch submission script for {executor_name}...")
+
+                        # 读取 machine 配置里的命令
+                        exec_conf = self.machine.config['executors'].get(executor_name, {})
+                        vasp_cmd = exec_conf.get('cmd', 'mpirun -np 1 vasp_std')  # 默认值
+
+                        # 生成 run_vasp.sh
+                        run_script_path = os.path.join(step_dir, "run_vasp.sh")
+                        with open(run_script_path, 'w') as f:
+                            f.write("#!/bin/bash\n")
+                            # 遍历 iter* 目录
+                            f.write(f"for dir in iter*_*; do\n")
+                            f.write(f"  if [ -d \"$dir\" ]; then\n")
+                            f.write(f"    echo \"Running VASP in $dir ...\"\n")
+                            f.write(f"    cd $dir\n")
+                            # 写入具体的 VASP 执行命令
+                            f.write(f"    {vasp_cmd} > vasp.log 2>&1\n")  # 重定向日志
+                            f.write(f"    cd ..\n")
+                            f.write(f"  fi\n")
+                            f.write(f"done\n")
+
+                        os.chmod(run_script_path, 0o755)
+
+                        # 执行 VASP 计算
+                        # 注意：如果是在 Slurm 上，这里应该提交 run_vasp.sh，并使用 Job ID 等待
+                        # 目前 Local 模式直接运行
+                        self.logger.info(">>> Executing VASP batch calculations (this may take time)...")
+                        if not run_cmd_with_log("./run_vasp.sh", step_dir, "scf_exec.log"):
+                            self.logger.error("VASP batch execution failed.")
+                            return
+
+                        # 5. 结果收集 (out2xyz)
+                        self.logger.info("Collecting results (out2xyz)...")
+                        cmd_collect = f"{kit_path} -out2xyz ."
+                        if run_cmd_with_log(cmd_collect, step_dir, "scf_collect.log"):
+                            # 检查结果
+                            res_dir = os.path.join(step_dir, "NEPdataset-multiple_frames")
+                            res_file = os.path.join(res_dir, "NEP-dataset.xyz")
+
+                            if os.path.exists(res_file):
+                                self.logger.info(f"VASP data collected: {res_file}")
+                                # 保存这个路径供 Train 使用
+                                self.new_data_chunk = res_file
+                                self.tracker.mark_done(task_id_scf)
+                            else:
+                                self.logger.error("NEP-dataset.xyz not found after collection.")
+                        else:
+                            self.logger.error("out2xyz failed.")
+
+                    else:
+                        self.logger.info("Skipping SCF (Already Done).")
+                        # 即使跳过，也要尝试恢复 self.new_data_chunk 变量，防止 Train 找不到数据
+                        # 这里简单推断一下
+                        res_file = os.path.join(step_dir, "NEPdataset-multiple_frames", "NEP-dataset.xyz")
+                        if os.path.exists(res_file):
+                            self.new_data_chunk = res_file