Files
NEP-auto/src/workflow.py
2025-12-10 10:30:54 +08:00

564 lines
32 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# src/workflow.py
import os
import shutil
import logging
import subprocess
from src.utils import load_yaml, run_cmd_with_log
from src.machine import MachineManager
from src.state import StateTracker # 新增
from src.steps import MDStep, SelectStep, SCFStep, TrainStep
class Workflow:
    """Active-learning NEP training workflow driver.

    Reads ``config/param.yaml`` and ``config/machine.yaml`` under *root_dir*
    and executes, per configured iteration, the steps:

        00.md (GPUMD sampling) -> 01.select (structure selection)
        -> 02.scf (VASP labelling) -> 03.train (NEP training)

    Every sub-task is checkpointed through ``StateTracker`` so an interrupted
    run can be resumed without redoing finished work. External tools
    (``gpumd``, ``gpumdkit.sh``, VASP, ``nep``) are driven via
    ``run_cmd_with_log``; on any fatal error the workflow logs and returns
    early, leaving tracker state intact for a later resume.
    """

    def __init__(self, root_dir: str) -> None:
        """Load configuration and set up workspace paths.

        Args:
            root_dir: Project root, expected to contain ``config/``,
                ``data/``, ``template/``; ``workspace/`` is created here.
        """
        self.root_dir = root_dir
        self.param = load_yaml(os.path.join(root_dir, "config/param.yaml"))
        self.machine = MachineManager(os.path.join(root_dir, "config/machine.yaml"))
        self.workspace = os.path.join(root_dir, "workspace")
        self.data_dir = os.path.join(root_dir, "data")
        self.template_dir = os.path.join(root_dir, "template")
        # Root logger; handlers are presumably configured by the entry point.
        self.logger = logging.getLogger()
        # Initialize state tracking (the workspace directory must exist first).
        os.makedirs(self.workspace, exist_ok=True)
        self.tracker = StateTracker(self.workspace)
        # Rolling state: current NEP potential and accumulated training set.
        # Both are advanced at the end of each 03.train step.
        self.current_nep_pot = os.path.join(self.data_dir, self.param['files']['initial_pot'])
        self.current_train_set = os.path.join(self.workspace, "accumulated_train.xyz")

    def run(self) -> None:
        """Execute all iterations defined in ``param.yaml``, resuming via the tracker.

        Finished sub-tasks (per ``StateTracker.is_done``) are skipped; skip
        branches also restore the instance attributes (``last_dump_path``,
        ``new_data_chunk``, ``current_nep_pot``, ``current_train_set``) that
        later steps depend on. Any fatal error logs and returns early.
        """
        self.logger.info(f"Workflow Started: {self.param['project']}")
        for iteration in self.param['iterations']:
            iter_id = iteration['id']
            iter_name = f"iter_{iter_id:02d}"
            iter_path = os.path.join(self.workspace, iter_name)
            self.logger.info(f"\n >>> Processing Iteration: {iter_id} <<<")
            os.makedirs(iter_path, exist_ok=True)
            for step_conf in iteration['steps']:
                step_name = step_conf['name']
                # ==========================
                # Step: 00.md
                # ==========================
                if step_name == "00.md":
                    step_dir = os.path.join(iter_path, "00.md")
                    # 1. Initialize model.xyz (done only once, in iteration 0).
                    task_id_init = f"{iter_name}.00.md.init"
                    if iter_id == 0:
                        if not self.tracker.is_done(task_id_init):
                            os.makedirs(step_dir, exist_ok=True)
                            poscar_name = self.param['files']['poscar']
                            poscar_src = os.path.join(self.data_dir, poscar_name)
                            if os.path.exists(poscar_src):
                                shutil.copy(poscar_src, os.path.join(step_dir, poscar_name))
                                atom_labels = self.param['files'].get('label', '')
                                kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
                                # gpumdkit '-addlabel' is expected to convert the POSCAR
                                # into model.xyz with atom labels attached.
                                cmd = f"{kit_path} -addlabel {poscar_name} {atom_labels}"
                                self.logger.info(f"Initializing model.xyz...")
                                if run_cmd_with_log(cmd, step_dir, "init.log"):
                                    self.tracker.mark_done(task_id_init)
                                else:
                                    self.logger.error("Initialization failed. Check iter_00/00.md/init.log")
                                    return
                            else:
                                self.logger.error("POSCAR missing.")
                                return
                        else:
                            self.logger.info("Skipping Init (Already Done).")
                    else:
                        # --- Subsequent iterations: reuse model.xyz from the previous round ---
                        # 1. The destination folder must exist before copying into it.
                        if not os.path.exists(step_dir):
                            os.makedirs(step_dir, exist_ok=True)
                        # 2. Only copy when this directory has no model.xyz yet
                        #    (idempotent on resume).
                        if not os.path.exists(os.path.join(step_dir, "model.xyz")):
                            prev_iter_name = f"iter_{iter_id - 1:02d}"
                            prev_model_src = os.path.join(self.workspace, prev_iter_name, "00.md", "model.xyz")
                            if os.path.exists(prev_model_src):
                                self.logger.info(f"Copying model.xyz from {prev_iter_name}...")
                                shutil.copy(prev_model_src, os.path.join(step_dir, "model.xyz"))
                            else:
                                self.logger.error(f"Previous model.xyz not found: {prev_model_src}")
                                return
                    # Resolve the gpumdkit path for the sub-tasks below.
                    kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
                    # === Sub-task 1: Preheat ===
                    task_id_preheat = f"{iter_name}.00.md.preheat"
                    preheat_dir = os.path.join(step_dir, "preheat")
                    if not self.tracker.is_done(task_id_preheat):
                        self.logger.info(">>> Starting Preheat...")
                        os.makedirs(preheat_dir, exist_ok=True)
                        # Prepare inputs: structure, current potential, run.in template.
                        shutil.copy(os.path.join(step_dir, "model.xyz"), os.path.join(preheat_dir, "model.xyz"))
                        shutil.copy(self.current_nep_pot, os.path.join(preheat_dir, "nep.txt"))
                        shutil.copy(os.path.join(self.template_dir, "00.md", "preheat", "run.in"),
                                    os.path.join(preheat_dir, "run.in"))
                        # A. Run GPUMD.
                        # Assumes the 'gpumd' command runs directly with no stdin input.
                        if not run_cmd_with_log("gpumd", preheat_dir, "step_exec.log"):
                            self.logger.error("Preheat GPUMD failed.")
                            return
                        # B. Run sampling (gpumdkit menu option 201).
                        # NOTE: the stdin feed must be exactly "201\ndump.xyz uniform 4"
                        # (no extra newline in between) — required by gpumdkit's prompts.
                        input_str_201 = "201\ndump.xyz uniform 4"
                        self.logger.info(">>> Running Sampling (201)...")
                        if run_cmd_with_log(kit_path, preheat_dir, "step_exec.log", input_str=input_str_201):
                            if os.path.exists(os.path.join(preheat_dir, "sampled_structures.xyz")):
                                self.tracker.mark_done(task_id_preheat)
                            else:
                                self.logger.error("sampled_structures.xyz not generated.")
                                return
                        else:
                            self.logger.error("Sampling command failed.")
                            return
                    else:
                        self.logger.info("Skipping Preheat (Already Done).")
                    # === Sub-task 2: Production ===
                    task_id_prod = f"{iter_name}.00.md.production"
                    prod_dir = os.path.join(step_dir, "production")
                    if not self.tracker.is_done(task_id_prod):
                        self.logger.info(">>> Starting Production...")
                        os.makedirs(prod_dir, exist_ok=True)
                        # 1. Stage base files into the production root.
                        #    Symlink (absolute target) instead of copying the large sample file.
                        src_sample = os.path.abspath(os.path.join(preheat_dir, "sampled_structures.xyz"))
                        dst_sample = os.path.join(prod_dir, "sampled_structures.xyz")
                        if os.path.exists(dst_sample): os.remove(dst_sample)
                        os.symlink(src_sample, dst_sample)
                        shutil.copy(self.current_nep_pot, os.path.join(prod_dir, "nep.txt"))
                        shutil.copy(os.path.join(self.template_dir, "00.md", "production", "run.in"),
                                    os.path.join(prod_dir, "run.in"))
                        # 2. Run gpumdkit option 302 (creates the md/ folder, sample_* folders
                        #    and presub.sh). 302 typically reads run.in from the current
                        #    directory and writes split run_x.in files under md/.
                        input_str_302 = "302"
                        if not run_cmd_with_log(kit_path, prod_dir, "step_exec.log", input_str=input_str_302):
                            self.logger.error("302 command failed.")
                            return
                        if not os.path.exists(os.path.join(prod_dir, "presub.sh")):
                            self.logger.error("presub.sh not found.")
                            return
                        # ---------------------------------------------------------
                        # 3. Complete the file set: copy nep.txt and run.in into md/.
                        # ---------------------------------------------------------
                        md_subdir = os.path.join(prod_dir, "md")
                        if os.path.exists(md_subdir):
                            self.logger.info("Copying nep.txt and run.in to 'md' folder...")
                            shutil.copy(os.path.join(prod_dir, "nep.txt"), os.path.join(md_subdir, "nep.txt"))
                            # Copy run.in: 302 may already have generated run_1.in etc.,
                            # but copy them in anyway for safety / user convention.
                            # NOTE(review): run_1..run_4 are hard-coded — assumes exactly
                            # four sub-runs; confirm against the 302 split count.
                            shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run.in"))
                            shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_1.in"))
                            shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_2.in"))
                            shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_3.in"))
                            shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_4.in"))
                        else:
                            self.logger.error("'md' folder was not created by 302 command.")
                            return
                        # 4. Execute presub.sh (make it executable first).
                        os.chmod(os.path.join(prod_dir, "presub.sh"), 0o755)
                        self.logger.info(">>> Executing presub.sh...")
                        if not run_cmd_with_log("./presub.sh", prod_dir, "step_exec.log"):
                            self.logger.error("presub.sh execution failed.")
                            return
                        # 5. Merge per-sample dump files into one dump.xyz.
                        self.logger.info("Merging dump files...")
                        run_cmd_with_log("cat sample_*/dump.xyz > dump.xyz", prod_dir, "step_exec.log")
                        # Remember the merged dump for the 01.select step.
                        self.last_dump_path = os.path.join(prod_dir, "dump.xyz")
                        self.tracker.mark_done(task_id_prod)
                    else:
                        self.logger.info("Skipping Production (Already Done).")
                        # Restore the attribute that 01.select relies on.
                        # NOTE(review): assumes dump.xyz still exists when the task was
                        # previously marked done — not re-verified here.
                        self.last_dump_path = os.path.join(prod_dir, "dump.xyz")
                # ==========================
                # Step: 01.select
                # ==========================
                elif step_name == "01.select":
                    step_dir = os.path.join(iter_path, "01.select")
                    task_id_select = f"{iter_name}.01.select"
                    if not self.tracker.is_done(task_id_select):
                        self.logger.info(f"=== Step: 01.select ({step_conf.get('method', 'distance')}) ===")
                        os.makedirs(step_dir, exist_ok=True)
                        # 1. Stage dump.xyz (as a symlink).
                        #    Prefer the path recorded by the previous step; otherwise infer it.
                        dump_src = getattr(self, 'last_dump_path', None)
                        if not dump_src:
                            # Inferred layout: iter_XX/00.md/production/dump.xyz
                            dump_src = os.path.join(iter_path, "00.md", "production", "dump.xyz")
                        if os.path.exists(dump_src):
                            dst_dump = os.path.join(step_dir, "dump.xyz")
                            if os.path.exists(dst_dump): os.remove(dst_dump)
                            # Link with an absolute path so the link survives cwd changes.
                            os.symlink(os.path.abspath(dump_src), dst_dump)
                        else:
                            self.logger.error(f"Source dump.xyz not found: {dump_src}")
                            return
                        # 2. Stage nep.txt (current potential).
                        shutil.copy(self.current_nep_pot, os.path.join(step_dir, "nep.txt"))
                        # 3. Stage train.xyz.
                        #    Rule: iteration 0 uses model.xyz; later iterations use the
                        #    accumulated training set (self.current_train_set).
                        if iter_id == 0:
                            # First round: take model.xyz from this iteration's 00.md.
                            model_xyz_src = os.path.join(iter_path, "00.md", "model.xyz")
                            if os.path.exists(model_xyz_src):
                                shutil.copy(model_xyz_src, os.path.join(step_dir, "train.xyz"))
                            else:
                                self.logger.error("model.xyz not found for initial train.xyz")
                                return
                        else:
                            # Subsequent rounds.
                            if os.path.exists(self.current_train_set):
                                shutil.copy(self.current_train_set, os.path.join(step_dir, "train.xyz"))
                            else:
                                self.logger.error(f"Previous train set missing: {self.current_train_set}")
                                return
                        # 4. Run the selection logic.
                        method = step_conf.get('method', 'distance')
                        params = step_conf.get('params', [0.01, 60, 120])  # [threshold, min, max]
                        kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
                        # === Branch A: selection by distance threshold (iterative loop) ===
                        if method == "distance":
                            threshold = params[0]
                            target_min, target_max = params[1], params[2]
                            step_size = 0.001
                            max_attempts = 15
                            success = False
                            self.logger.info(f"Targeting {target_min}-{target_max} structures. Initial thr={threshold}")
                            for attempt in range(max_attempts):
                                self.logger.info(f"--- Attempt {attempt + 1}: Threshold {threshold:.4f} ---")
                                # Build stdin: 203 -> filenames -> 1 (distance) -> threshold
                                # Format: "203\ndump.xyz train.xyz nep.txt\n1\n0.01"
                                input_str = f"203\ndump.xyz train.xyz nep.txt\n1\n{threshold}"
                                # Run gpumdkit (output appended to the log file).
                                if not run_cmd_with_log(kit_path, step_dir, "select_exec.log", input_str=input_str):
                                    self.logger.error("gpumdkit execution failed.")
                                    break  # fatal error: abandon the retry loop
                                # Check how many structures landed in selected.xyz.
                                if not os.path.exists(os.path.join(step_dir, "selected.xyz")):
                                    self.logger.warning("selected.xyz not found. Retrying...")
                                    continue
                                try:
                                    # Count frames: grep -c "Lat" selected.xyz
                                    # (one "Lattice" header per extxyz frame — assumption).
                                    count_out = subprocess.check_output('grep -c "Lat" selected.xyz', shell=True,
                                                                        cwd=step_dir)
                                    count = int(count_out.decode().strip())
                                except:
                                    # NOTE(review): bare except silently maps any failure
                                    # (grep missing, no match, parse error) to count = 0.
                                    count = 0
                                self.logger.info(f"Selected count: {count}")
                                # Decide: accept, loosen, or tighten the threshold.
                                if target_min <= count <= target_max:
                                    self.logger.info(f"Success! {count} structures selected.")
                                    success = True
                                    break
                                elif count < target_min:
                                    # Too few selected -> threshold too strict -> lower it.
                                    # Assumption: threshold is the "minimum allowed distance";
                                    # structures with distance > threshold are selected, so a
                                    # smaller threshold is looser and selects more. TODO confirm.
                                    self.logger.info("Too few. Decreasing threshold (Loosening).")
                                    threshold -= step_size
                                    # Clamp: threshold must stay positive.
                                    if threshold < 0: threshold = 0.0001
                                else:  # count > target_max
                                    # Too many selected -> raise the threshold.
                                    self.logger.info("Too many. Increasing threshold (Tightening).")
                                    threshold += step_size
                            if success:
                                self.tracker.mark_done(task_id_select)
                            else:
                                self.logger.warning(
                                    "Failed to reach target count within max attempts. Using last result.")
                                # Deliberate choice: mark done anyway so the workflow can
                                # continue with the last selection (could instead halt here).
                                self.tracker.mark_done(task_id_select)
                        # === Branch B: selection by count / random (one-shot) ===
                        elif method == "random":
                            min_n, max_n = params[1], params[2]
                            # Build stdin: 203 -> filenames -> 2 (by number) -> min max
                            input_str = f"203\ndump.xyz train.xyz nep.txt\n2\n{min_n} {max_n}"
                            self.logger.info(f"Random selection: {min_n}-{max_n}")
                            if run_cmd_with_log(kit_path, step_dir, "select_exec.log", input_str=input_str):
                                self.tracker.mark_done(task_id_select)
                            else:
                                self.logger.error("Random selection failed.")
                                return
                    else:
                        self.logger.info("Skipping Select (Already Done).")
                # ==========================
                # Step: 02.scf (VASP Calculation)
                # ==========================
                elif step_name == "02.scf":
                    step_dir = os.path.join(iter_path, "02.scf")
                    task_id_scf = f"{iter_name}.02.scf"
                    if not self.tracker.is_done(task_id_scf):
                        self.logger.info("=== Step: 02.scf (VASP) ===")
                        os.makedirs(step_dir, exist_ok=True)
                        # 1. Stage selected.xyz from this iteration's 01.select step.
                        select_step_dir = os.path.join(iter_path, "01.select")
                        src_selected = os.path.join(select_step_dir, "selected.xyz")
                        if not os.path.exists(src_selected):
                            self.logger.error(f"selected.xyz not found in {select_step_dir}")
                            return
                        dst_selected = os.path.join(step_dir, "selected.xyz")
                        if os.path.exists(dst_selected): os.remove(dst_selected)
                        os.symlink(os.path.abspath(src_selected), dst_selected)
                        # 2. Run gpumdkit option 301 to split structures.
                        #    Equivalent shell: echo -e "301\niter" | gpumdkit.sh
                        #    This is expected to create iterX_1, iterX_2, ... and an fp/ folder.
                        kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
                        # "iter" is the folder-name prefix; gpumdkit appends the numbers.
                        input_str_301 = "301\niter"
                        self.logger.info("Splitting structures (301)...")
                        if not run_cmd_with_log(kit_path, step_dir, "scf_setup.log", input_str=input_str_301):
                            self.logger.error("301 command failed.")
                            return
                        # 3. Distribute VASP input files into the 'fp' folder.
                        #    gpumdkit's fp/ folder typically holds the shared inputs;
                        #    the per-structure subfolders symlink to it (assumption).
                        fp_dir = os.path.join(step_dir, "fp")
                        if not os.path.exists(fp_dir):
                            self.logger.error("'fp' directory was not created by 301.")
                            return
                        self.logger.info("Distributing VASP inputs to 'fp' folder...")
                        # A. POTCAR (from the data directory).
                        potcar_src = os.path.join(self.data_dir, self.param['files']['potcar'])
                        if os.path.exists(potcar_src):
                            shutil.copy(potcar_src, os.path.join(fp_dir, "POTCAR"))
                        else:
                            self.logger.error(f"POTCAR missing: {potcar_src}")
                            return
                        # B. INCAR (from the template directory).
                        #    Template path: template/02.scf/INCAR
                        incar_src = os.path.join(self.template_dir, "02.scf", "INCAR")
                        if os.path.exists(incar_src):
                            shutil.copy(incar_src, os.path.join(fp_dir, "INCAR"))
                        else:
                            self.logger.error(f"INCAR missing in template: {incar_src}")
                            return
                        # C. KPOINTS (from the template directory; optional).
                        kpoints_src = os.path.join(self.template_dir, "02.scf", "KPOINTS")
                        if os.path.exists(kpoints_src):
                            shutil.copy(kpoints_src, os.path.join(fp_dir, "KPOINTS"))
                        else:
                            self.logger.info("KPOINTS not found in template, assuming KSPACING in INCAR.")
                        # 4. Generate and run the batch-calculation script.
                        #    We ignore gpumdkit's presub.sh here and build our own from
                        #    machine.yaml. For now this is a simple local-mode loop script.
                        executor_name = step_conf.get('executor', 'vasp_gpu')  # default executor key
                        # Fetch the execution command (e.g. "mpirun -np 32 vasp_std")
                        # from the machine config; a proper version would delegate batch
                        # script generation to the machine module.
                        self.logger.info(f"Generating batch submission script for {executor_name}...")
                        exec_conf = self.machine.config['executors'].get(executor_name, {})
                        vasp_cmd = exec_conf.get('cmd', 'mpirun -np 1 vasp_std')  # fallback command
                        # Write run_vasp.sh: loop over every iter*_* folder and run VASP in it.
                        run_script_path = os.path.join(step_dir, "run_vasp.sh")
                        with open(run_script_path, 'w') as f:
                            f.write("#!/bin/bash\n")
                            # Iterate the split iter* directories.
                            f.write(f"for dir in iter*_*; do\n")
                            f.write(f"  if [ -d \"$dir\" ]; then\n")
                            f.write(f"    echo \"Running VASP in $dir ...\"\n")
                            f.write(f"    cd $dir\n")
                            # The actual VASP invocation, log redirected per folder.
                            f.write(f"    {vasp_cmd} > vasp.log 2>&1\n")
                            f.write(f"    cd ..\n")
                            f.write(f"  fi\n")
                            f.write(f"done\n")
                        os.chmod(run_script_path, 0o755)
                        # Run the VASP batch.
                        # NOTE: on a Slurm machine this should instead submit run_vasp.sh
                        # and wait on the job ID; local mode just runs it inline.
                        self.logger.info(">>> Executing VASP batch calculations (this may take time)...")
                        if not run_cmd_with_log("./run_vasp.sh", step_dir, "scf_exec.log"):
                            self.logger.error("VASP batch execution failed.")
                            return
                        # 5. Collect results (gpumdkit -out2xyz).
                        self.logger.info("Collecting results (out2xyz)...")
                        cmd_collect = f"{kit_path} -out2xyz ."
                        if run_cmd_with_log(cmd_collect, step_dir, "scf_collect.log"):
                            # Verify the expected output file appeared.
                            res_dir = os.path.join(step_dir, "NEPdataset-multiple_frames")
                            res_file = os.path.join(res_dir, "NEP-dataset.xyz")
                            if os.path.exists(res_file):
                                self.logger.info(f"VASP data collected: {res_file}")
                                # Remember this path for the 03.train step.
                                self.new_data_chunk = res_file
                                self.tracker.mark_done(task_id_scf)
                            else:
                                self.logger.error("NEP-dataset.xyz not found after collection.")
                        else:
                            self.logger.error("out2xyz failed.")
                    else:
                        self.logger.info("Skipping SCF (Already Done).")
                        # Even when skipping, try to restore self.new_data_chunk so the
                        # Train step can still find this iteration's data.
                        res_file = os.path.join(step_dir, "NEPdataset-multiple_frames", "NEP-dataset.xyz")
                        if os.path.exists(res_file):
                            self.new_data_chunk = res_file
                # ==========================
                # Step: 03.train (Training)
                # ==========================
                elif step_name == "03.train":
                    step_dir = os.path.join(iter_path, "03.train")
                    task_id_train = f"{iter_name}.03.train"
                    if not self.tracker.is_done(task_id_train):
                        self.logger.info("=== Step: 03.train (NEP) ===")
                        os.makedirs(step_dir, exist_ok=True)
                        # 1. Build train.xyz by concatenation:
                        #    current total = previous total train set + new SCF data chunk.
                        self.logger.info("Merging training data...")
                        train_xyz_path = os.path.join(step_dir, "train.xyz")
                        # Open the destination file for writing.
                        with open(train_xyz_path, 'w') as outfile:
                            # A. Append the previous accumulated data (if any).
                            if os.path.exists(self.current_train_set):
                                self.logger.info(f"Appending previous data: {self.current_train_set}")
                                with open(self.current_train_set, 'r') as infile:
                                    shutil.copyfileobj(infile, outfile)
                            # B. Append the new data from this iteration's SCF step.
                            #    Use the recorded attribute if present; otherwise infer the path.
                            new_data = getattr(self, 'new_data_chunk', None)
                            if not new_data:
                                # Inferred: iter_XX/02.scf/NEPdataset-multiple_frames/NEP-dataset.xyz
                                new_data = os.path.join(iter_path, "02.scf", "NEPdataset-multiple_frames",
                                                        "NEP-dataset.xyz")
                            if new_data and os.path.exists(new_data):
                                self.logger.info(f"Appending new data: {new_data}")
                                with open(new_data, 'r') as infile:
                                    shutil.copyfileobj(infile, outfile)
                            else:
                                # No new chunk: fatal on iteration 0 with no prior data,
                                # otherwise train on the old data only.
                                if iter_id == 0 and not os.path.exists(self.current_train_set):
                                    self.logger.error("No training data available (neither previous nor new).")
                                    return
                                else:
                                    self.logger.warning("No new data found from SCF step. Training on old data only.")
                        # Advance the rolling pointer to the freshly merged train.xyz
                        # so the next iteration picks it up.
                        self.current_train_set = train_xyz_path
                        # 2. Stage nep.in from the template.
                        template_nep_in = os.path.join(self.template_dir, "03.train", "nep.in")
                        if os.path.exists(template_nep_in):
                            shutil.copy(template_nep_in, os.path.join(step_dir, "nep.in"))
                        else:
                            self.logger.error(f"nep.in template missing: {template_nep_in}")
                            return
                        # 3. Run the training.
                        executor_name = step_conf.get('executor', 'nep_local')
                        # The 'nep' binary takes no arguments; it reads nep.in from cwd.
                        # NOTE(review): executor_name is fetched but the literal "nep"
                        # command is run — the machine.yaml executor cmd is not consulted;
                        # adjust if machine.yaml defines a custom command.
                        self.logger.info(f"Starting NEP training using {executor_name}...")
                        # Training output goes to train_exec.log.
                        if not run_cmd_with_log("nep", step_dir,
                                                "train_exec.log"):  # assumes cmd is 'nep'; adjust for machine.yaml overrides
                            self.logger.error("NEP training failed.")
                            return
                        # Verify nep.txt was produced.
                        if os.path.exists(os.path.join(step_dir, "nep.txt")):
                            self.logger.info("Training finished. nep.txt generated.")
                            # Advance the potential pointer for the next iteration's MD.
                            self.current_nep_pot = os.path.join(step_dir, "nep.txt")
                        else:
                            self.logger.error("nep.txt not found after training.")
                            return
                        # 4. Post-processing: plots and archiving.
                        self.logger.info("Generating plots (gpumdkit.sh -plt train)...")
                        kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
                        cmd_plt = f"{kit_path} -plt train"
                        # Best-effort: plot failure does not abort the workflow.
                        run_cmd_with_log(cmd_plt, step_dir, "plot.log")
                        # Create the output directory inside the current iteration.
                        output_dir = os.path.join(iter_path, "05.output")
                        os.makedirs(output_dir, exist_ok=True)
                        # Archive the generated png plots.
                        for file in os.listdir(step_dir):
                            if file.endswith(".png"):
                                src_png = os.path.join(step_dir, file)
                                dst_png = os.path.join(output_dir, file)
                                shutil.copy(src_png, dst_png)
                                self.logger.info(f"Archived plot: {file}")
                        self.tracker.mark_done(task_id_train)
                    else:
                        self.logger.info("Skipping Train (Already Done).")
                        # Restore the rolling state for subsequent iterations.
                        # NOTE(review): set unconditionally — assumes nep.txt and
                        # train.xyz still exist from the completed run; not re-verified.
                        self.current_nep_pot = os.path.join(step_dir, "nep.txt")
                        self.current_train_set = os.path.join(step_dir, "train.xyz")