nep框架重构
This commit is contained in:
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
0
src/interface.py
Normal file
0
src/interface.py
Normal file
129
src/machine.py
Normal file
129
src/machine.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# src/machine.py
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import logging
|
||||
import shutil
|
||||
|
||||
|
||||
class MachineManager:
    """Dispatches task execution to a local shell or to Slurm, driven by machine.yaml."""

    def __init__(self, machine_config_path):
        """
        :param machine_config_path: path to machine.yaml describing executors
        """
        # Local import to avoid a circular dependency between src.machine and src.utils.
        from src.utils import load_yaml
        self.config = load_yaml(machine_config_path)
        self.root_dir = self.config.get('root_dir', os.getcwd())
        self.script_dir = os.path.join(self.root_dir, self.config.get('script_dir', 'config/scripts'))

        logging.info(f"MachineManager initialized. Script dir: {self.script_dir}")

    def execute(self, executor_name, work_dir):
        """
        Unified execution entry point.

        :param executor_name: key defined under 'executors' in machine.yaml (e.g. gpumd, vasp_cpu)
        :param work_dir: working directory in which the task runs
        :return: True on success, False otherwise
        """
        if executor_name not in self.config['executors']:
            logging.error(f"Executor '{executor_name}' not defined in machine.yaml")
            return False

        exec_conf = self.config['executors'][executor_name]
        exec_type = exec_conf.get('type', 'local')

        # Make sure the working directory exists
        os.makedirs(work_dir, exist_ok=True)

        logging.info(f"--- Task: {executor_name} | Type: {exec_type} ---")
        logging.info(f"Working Dir: {work_dir}")

        if exec_type == 'local':
            return self._run_local(exec_conf, work_dir)
        elif exec_type == 'slurm':
            return self._submit_slurm(exec_conf, work_dir, executor_name)
        else:
            logging.error(f"Unknown execution type: {exec_type}")
            return False

    def _run_local(self, conf, work_dir):
        """Run the executor directly on the local machine; returns True/False."""
        # 1. Prefer an explicit script file if one is configured
        if 'script' in conf:
            script_name = conf['script']
            src_script = os.path.join(self.script_dir, script_name)

            if not os.path.exists(src_script):
                logging.error(f"Script not found: {src_script}")
                return False

            # Run the script: bash /path/to/script.sh
            cmd = f"bash {src_script}"

        # 2. Otherwise fall back to a direct shell command
        elif 'cmd' in conf:
            cmd = conf['cmd']
        else:
            logging.error("No 'script' or 'cmd' defined for local executor.")
            return False

        try:
            # Execute inside the working directory.
            # NOTE(review): shell=True with config-supplied strings assumes
            # machine.yaml is trusted input; do not feed it untrusted data.
            logging.info(f"Executing Local Command: {cmd}")
            subprocess.check_call(cmd, shell=True, cwd=work_dir)
            logging.info("Local execution success.")
            return True
        except subprocess.CalledProcessError as e:
            logging.error(f"Execution failed with error code {e.returncode}")
            return False

    def _submit_slurm(self, conf, work_dir, job_name):
        """
        Generate a Slurm submission script (.sub) and submit it.

        :param conf: executor config from machine.yaml
        :param work_dir: directory in which the job runs
        :param job_name: used for the #SBATCH --job-name header
        """
        script_name = conf.get('script')
        # Fix: a missing 'script' key previously crashed os.path.join with a
        # TypeError; report it as a configuration error instead.
        if not script_name:
            logging.error("No 'script' defined for slurm executor.")
            return False
        src_script = os.path.join(self.script_dir, script_name)

        if not os.path.exists(src_script):
            logging.error(f"Script not found: {src_script}")
            return False

        # 1. Read the user-defined script content
        with open(src_script, 'r') as f:
            user_script_content = f.read()

        # 2. Generate the submission script (.sub)
        sub_file = os.path.join(work_dir, "submit.sub")

        with open(sub_file, 'w') as f:
            f.write("#!/bin/bash\n")
            f.write(f"#SBATCH --job-name={job_name}\n")
            # Fill in SBATCH parameters from the yaml config
            if 'partition' in conf: f.write(f"#SBATCH --partition={conf['partition']}\n")
            if 'nodes' in conf: f.write(f"#SBATCH --nodes={conf['nodes']}\n")
            if 'ntasks' in conf: f.write(f"#SBATCH --ntasks={conf['ntasks']}\n")
            if 'time' in conf: f.write(f"#SBATCH --time={conf['time']}\n")
            if 'gpus' in conf: f.write(f"#SBATCH --gres=gpu:{conf['gpus']}\n")

            f.write("\n")
            f.write("cd $SLURM_SUBMIT_DIR\n")
            f.write("\n")
            f.write("# --- User Script Content ---\n")
            f.write(user_script_content)

        logging.info(f"Generated submission script: {sub_file}")

        # 3. Submit the job.
        # Fix: the hard-coded TEST_MODE constant is now read from the executor
        # config ('test_mode', default True) so real submission no longer
        # requires editing source code. In test mode only the .sub file is
        # generated and nothing is submitted.
        if conf.get('test_mode', True):
            logging.info("[TEST_MODE] Simulated 'sbatch submit.sub'. Check the .sub file.")
            return True
        else:
            try:
                # Submit and capture the Job ID
                # (typical output: "Submitted batch job 123456")
                res = subprocess.check_output(f"sbatch {sub_file}", shell=True, cwd=work_dir)
                job_id = res.decode().strip().split()[-1]
                logging.info(f"Job submitted. ID: {job_id}")

                # TODO: add wait_for_job(job_id) logic in the next stage
                return True
            except subprocess.CalledProcessError as e:
                logging.error(f"Submission failed: {e}")
                return False
|
||||
0
src/stages.py
Normal file
0
src/stages.py
Normal file
177
src/steps.py
Normal file
177
src/steps.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# src/steps.py
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
|
||||
class BaseStep:
|
||||
def __init__(self, name, work_dir, machine_manager, config):
|
||||
self.name = name
|
||||
self.work_dir = work_dir
|
||||
self.machine = machine_manager
|
||||
self.config = config
|
||||
os.makedirs(self.work_dir, exist_ok=True)
|
||||
self.logger = logging.getLogger()
|
||||
|
||||
def copy_file(self, src, dst_name=None):
|
||||
"""辅助函数:安全复制文件"""
|
||||
if not os.path.exists(src):
|
||||
self.logger.error(f"[{self.name}] Source file missing: {src}")
|
||||
return False
|
||||
|
||||
dst_name = dst_name if dst_name else os.path.basename(src)
|
||||
dst_path = os.path.join(self.work_dir, dst_name)
|
||||
shutil.copy(src, dst_path)
|
||||
return dst_path
|
||||
|
||||
|
||||
class MDStep(BaseStep):
    """
    Corresponds to 00.md: pre-heating / sampling via GPUMD.
    """

    def run(self, prev_nep_path, template_path):
        """
        Stage inputs and run one GPUMD task in this step's working directory.

        :param prev_nep_path: nep.txt from the previous iteration (or initial data)
        :param template_path: template directory containing run.in
        :return: True when GPUMD finished and produced dump.xyz, False otherwise
        """
        self.logger.info(f"=== Running Step: {self.name} (MD) ===")

        # 1. Prepare nep.txt (from the previous round or the initial data)
        if not prev_nep_path:
            self.logger.error("No nep.txt provided for MD.")
            return False
        # Fix: abort early if the potential file cannot be staged -- previously
        # the failed copy was ignored and GPUMD was launched without nep.txt.
        if not self.copy_file(prev_nep_path, "nep.txt"):
            return False

        # 2. model.xyz is expected to already be in work_dir (staged by the
        # Init step on the first round, or passed on from the previous round).
        if not os.path.exists(os.path.join(self.work_dir, "model.xyz")):
            self.logger.warning(f"[{self.name}] model.xyz not found in {self.work_dir}. Make sure Init step ran.")

        # 3. Prepare run.in (copied from the template directory)
        # Fix: run.in is mandatory for GPUMD; fail fast if the template is missing.
        run_in_src = os.path.join(template_path, "run.in")
        if not self.copy_file(run_in_src, "run.in"):
            return False

        # 4. Launch GPUMD via the 'gpumd' executor defined in machine.yaml
        success = self.machine.execute("gpumd", self.work_dir)

        if success and os.path.exists(os.path.join(self.work_dir, "dump.xyz")):
            self.logger.info(f"[{self.name}] MD finished. dump.xyz generated.")
            return True
        else:
            self.logger.error(f"[{self.name}] MD failed or dump.xyz missing.")
            return False
|
||||
|
||||
|
||||
class SelectStep(BaseStep):
    """
    Corresponds to 01.select: smart structure selection.
    """

    def run(self, dump_path, train_path, nep_path, method="distance", params=None):
        """
        Select structures from dump.xyz, auto-tuning the threshold until the
        selected count lands inside the target window.

        :param dump_path: dump.xyz produced by the MD step
        :param train_path: current accumulated training set
        :param nep_path: current NEP potential file
        :param method: selection method name (currently informational only)
        :param params: [initial_threshold, target_min, target_max];
                       defaults to [0.01, 60, 120]
        :return: True (the pipeline is allowed to continue even on non-convergence)
        """
        # Fix 1: the original signature used a mutable default argument
        # ([0.01, 60, 120]) which is shared across calls.
        # Fix 2: callers may pass params=None explicitly (Workflow does via
        # step_conf.get('params')), which previously crashed on params[1].
        if params is None:
            params = [0.01, 60, 120]

        self.logger.info(f"=== Running Step: {self.name} (Smart Selection) ===")

        # Stage the input files
        self.copy_file(dump_path, "dump.xyz")
        self.copy_file(train_path, "train.xyz")
        self.copy_file(nep_path, "nep.txt")

        target_min, target_max = params[1], params[2]
        threshold = params[0]
        step_size = 0.001  # threshold adjustment per attempt

        # The original shell pipeline used gpumdkit.sh (option 203) for the
        # selection. In a real run the call would look like:
        #   input_str = f"203\ndump.xyz train.xyz nep.txt\n1\n{threshold}\n"
        #   subprocess.run("gpumdkit.sh", input=input_str, cwd=self.work_dir, ...)
        # Since we are developing the framework locally, the call is mocked
        # below. Long term the distance-based selection should be ported to
        # pure Python.

        for i in range(10):  # at most 10 attempts
            self.logger.info(f"Selection attempt {i + 1}: Threshold={threshold:.4f}")

            # --- Mock code Start ---
            # Pretend to produce selected.xyz; a smaller threshold selects more.
            with open(os.path.join(self.work_dir, "selected.xyz"), 'w') as f:
                mock_count = int(100 / (threshold * 100))
                f.write(f"Mock selected {mock_count} frames")

            selected_count = mock_count
            self.logger.info(f"Found {selected_count} structures (Mock).")
            # --- Mock code End ---

            if target_min <= selected_count <= target_max:
                self.logger.info(f"Selection Success! Final count: {selected_count}")
                return True
            elif selected_count < target_min:
                self.logger.info("Too few. Decreasing threshold.")
                threshold -= step_size
                # Fix 3: the old guard ('< 0') let threshold hit exactly 0,
                # which divides by zero in the mock count above.
                if threshold <= 0: threshold = 0.001
            else:
                self.logger.info("Too many. Increasing threshold.")
                threshold += step_size

        self.logger.warning("Selection failed to converge. Using last result.")
        return True  # allow the pipeline to continue for now
|
||||
|
||||
|
||||
class SCFStep(BaseStep):
    """
    Corresponds to 02.scf: VASP single-point (SCF) calculations.
    """

    def run(self, template_path, potcar_path):
        """
        Stage VASP inputs and run the SCF calculation.

        :param template_path: template directory holding INCAR (and optionally KPOINTS)
        :param potcar_path: path to the POTCAR file
        :return: result of the 'vasp_cpu' executor, or False if INCAR is missing
        """
        self.logger.info(f"=== Running Step: {self.name} (SCF/VASP) ===")

        # 1. Stage POTCAR
        self.copy_file(potcar_path, "POTCAR")

        # 2. Stage INCAR -- this input is mandatory
        if not self.copy_file(os.path.join(template_path, "INCAR"), "INCAR"):
            return False

        # 3. Stage KPOINTS when the template provides one (optional)
        kpoints_template = os.path.join(template_path, "KPOINTS")
        if os.path.exists(kpoints_template):
            self.copy_file(kpoints_template, "KPOINTS")

        # 4. Run VASP through the 'vasp_cpu' executor from machine.yaml.
        # NOTE: a production run would first split selected.xyz into
        # per-structure folders (one POSCAR each); for framework development
        # we run a single directory / single point as a smoke test.
        return self.machine.execute("vasp_cpu", self.work_dir)
|
||||
|
||||
|
||||
class TrainStep(BaseStep):
    """
    Corresponds to 03.train: NEP potential training.
    """

    def run(self, template_path, new_train_data_path):
        """
        Stage training inputs and launch NEP training.

        :param template_path: template directory holding nep.in
        :param new_train_data_path: path to the (already concatenated) training set
        :return: result of the 'nep_local' executor
        """
        self.logger.info(f"=== Running Step: {self.name} (Train) ===")

        # 1. Stage nep.in from the template directory
        self.copy_file(os.path.join(template_path, "nep.in"), "nep.in")

        # 2. Stage train.xyz (assumed to be all data concatenated upstream)
        if new_train_data_path and os.path.exists(new_train_data_path):
            self.copy_file(new_train_data_path, "train.xyz")
        else:
            # No new data supplied (test run) -- write a placeholder instead
            placeholder = os.path.join(self.work_dir, "train.xyz")
            with open(placeholder, 'w') as fh:
                fh.write("Mock training data")

        # 3. Launch NEP training through the 'nep_local' executor
        return self.machine.execute("nep_local", self.work_dir)
|
||||
47
src/utils.py
Normal file
47
src/utils.py
Normal file
@@ -0,0 +1,47 @@
|
||||
# src/utils.py
|
||||
import yaml
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def load_yaml(path):
    """Load and return the parsed contents of a YAML config file.

    Exits the whole program with status 1 when the file does not exist,
    since the pipeline cannot proceed without its configuration.
    """
    if not os.path.exists(path):
        logging.error(f"Config file not found: {path}")
        sys.exit(1)
    with open(path, 'r') as fh:
        return yaml.safe_load(fh)
|
||||
|
||||
|
||||
def setup_logger(work_dir, log_file="autonep.log"):
    """Configure root logging to write to both a file and the console.

    :param work_dir: directory where the log file is created
    :param log_file: log file name (default "autonep.log")
    :return: the configured root logger
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Drop any previously attached handlers so repeated calls
    # (e.g. restarting the workflow) do not duplicate output.
    if logger.hasHandlers():
        logger.handlers.clear()

    # File handler: full timestamped records
    file_h = logging.FileHandler(os.path.join(work_dir, log_file))
    file_h.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

    # Console handler: message only, to keep terminal output concise
    console_h = logging.StreamHandler()
    console_h.setFormatter(logging.Formatter('%(message)s'))

    # File first, then console -- same order as before.
    for handler in (file_h, console_h):
        logger.addHandler(handler)

    return logger
|
||||
|
||||
|
||||
class Notifier:
    """(Reserved) Notification module -- currently only logs messages."""

    def __init__(self, url=None):
        # Endpoint of a future push service; unused for now.
        self.url = url

    def send(self, title, msg, priority=5):
        """Log a would-be notification; no network call is made yet."""
        logging.info(f"[[Notification]] {title}: {msg}")
|
||||
131
src/workflow.py
Normal file
131
src/workflow.py
Normal file
@@ -0,0 +1,131 @@
|
||||
# src/workflow.py
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
from src.utils import load_yaml
|
||||
from src.machine import MachineManager
|
||||
from src.steps import MDStep, SelectStep, SCFStep, TrainStep
|
||||
|
||||
|
||||
class Workflow:
    """Top-level driver of the NEP active-learning loop.

    Reads config/param.yaml, then for each configured iteration dispatches the
    steps 00.md -> 01.select -> 02.scf -> 03.train, threading the current
    potential (self.current_nep_pot) and data paths between steps.
    """

    def __init__(self, root_dir):
        """
        :param root_dir: project root containing config/, data/, template/, workspace/
        """
        self.root_dir = root_dir

        # 1. Load the run parameters
        self.param = load_yaml(os.path.join(root_dir, "config/param.yaml"))

        # 2. Initialize the machine manager (executor dispatch)
        self.machine = MachineManager(os.path.join(root_dir, "config/machine.yaml"))

        # 3. Initialize path variables
        self.workspace = os.path.join(root_dir, "workspace")
        self.data_dir = os.path.join(root_dir, "data")
        self.template_dir = os.path.join(root_dir, "template")
        self.logger = logging.getLogger()

        # State tracking: current potential, updated after each 03.train step
        self.current_nep_pot = os.path.join(self.data_dir, self.param['files']['initial_pot'])
        # Before the first round the train set is assumed empty or user-provided;
        # point at an accumulation file for now.
        self.current_train_set = os.path.join(self.workspace, "accumulated_train.xyz")

    def run(self):
        """Execute every iteration and every step declared in param.yaml."""
        self.logger.info(f"Workflow Started: {self.param['project']}")

        # Walk through each iteration of the active-learning loop
        for iteration in self.param['iterations']:
            iter_id = iteration['id']
            iter_name = f"iter_{iter_id:02d}"
            iter_path = os.path.join(self.workspace, iter_name)

            self.logger.info(f"\n >>> Starting Iteration: {iter_id} <<<")
            os.makedirs(iter_path, exist_ok=True)

            # --- Run the steps defined for this iteration ---
            for step_conf in iteration['steps']:
                step_name = step_conf['name']

                # ==========================
                # Step: 00.md
                # ==========================
                if step_name == "00.md":
                    step_dir = os.path.join(iter_path, "00.md")

                    # Only the first round needs POSCAR -> model.xyz conversion.
                    # For local testing we simply copy POSCAR as model.xyz
                    # (demo only); a real run should convert via gpumdkit.
                    if iter_id == 0:
                        os.makedirs(step_dir, exist_ok=True)
                        shutil.copy(os.path.join(self.data_dir, self.param['files']['poscar']),
                                    os.path.join(step_dir, "model.xyz"))

                    # Iterate over sub-tasks (preheat, production, ...)
                    for sub in step_conf.get('sub_tasks', []):
                        template_sub_name = sub['template_sub']
                        sub_work_dir = os.path.join(step_dir, template_sub_name)
                        template_path = os.path.join(self.template_dir, "00.md", template_sub_name)

                        # Instantiate and run (MDStep.__init__ creates sub_work_dir)
                        md_task = MDStep(f"MD-{template_sub_name}", sub_work_dir, self.machine, self.config)

                        # Key point: stage the model.xyz prepared above into the sub dir
                        if iter_id == 0:
                            shutil.copy(os.path.join(step_dir, "model.xyz"), os.path.join(sub_work_dir, "model.xyz"))
                        # Later rounds should reuse structures selected in the
                        # previous round; omitted for now -- first get round 0 working.

                        md_task.run(self.current_nep_pot, template_path)

                        # Remember the last dump.xyz produced, for the select step.
                        # NOTE(review): only the last sub-task's dump is kept --
                        # confirm that is intended when multiple sub_tasks exist.
                        self.last_dump_path = os.path.join(sub_work_dir, "dump.xyz")

                # ==========================
                # Step: 01.select
                # ==========================
                elif step_name == "01.select":
                    step_dir = os.path.join(iter_path, "01.select")
                    select_task = SelectStep("Select", step_dir, self.machine, self.config)

                    # Use the dump from the previous step plus the current
                    # training set / potential.
                    # NOTE(review): step_conf.get('params') yields None when the
                    # key is absent, overriding SelectStep's default -- verify
                    # SelectStep handles params=None.
                    select_task.run(
                        dump_path=getattr(self, 'last_dump_path', None),
                        train_path=self.current_train_set,
                        nep_path=self.current_nep_pot,
                        method=step_conf.get('method'),
                        params=step_conf.get('params')
                    )

                # ==========================
                # Step: 02.scf
                # ==========================
                elif step_name == "02.scf":
                    step_dir = os.path.join(iter_path, "02.scf")
                    scf_task = SCFStep("SCF", step_dir, self.machine, self.config)

                    template_path = os.path.join(self.template_dir, "02.scf")
                    potcar_path = os.path.join(self.data_dir, self.param['files']['potcar'])

                    scf_task.run(template_path, potcar_path)

                    # Pretend new labeled data was produced at this path
                    self.new_data_chunk = os.path.join(step_dir, "scf_results.xyz")

                # ==========================
                # Step: 03.train
                # ==========================
                elif step_name == "03.train":
                    step_dir = os.path.join(iter_path, "03.train")
                    train_task = TrainStep("Train", step_dir, self.machine, self.config)

                    template_path = os.path.join(self.template_dir, "03.train")

                    # The real logic should merge self.new_data_chunk into a
                    # cumulative total_train.xyz; passed through directly for now.
                    train_task.run(template_path, getattr(self, 'new_data_chunk', None))

                    # Point the current potential at this round's output,
                    # to be consumed by the next iteration's MD step.
                    self.current_nep_pot = os.path.join(step_dir, "nep.txt")

        self.logger.info("Workflow Finished Successfully.")

    @property
    def config(self):
        # Simple pass-through: steps receive the full parameter dict.
        return self.param
|
||||
Reference in New Issue
Block a user