Compare commits
31 Commits
main
...
reconstruc
| Author | SHA1 | Date | |
|---|---|---|---|
| def07faf6d | |||
| b6a1af5264 | |||
| f95f4011d4 | |||
| 405917e747 | |||
| 908bb67d6e | |||
| c1de59a7f2 | |||
| 83cfe0c1b7 | |||
| 12d232cb80 | |||
| 2e35ad3f8f | |||
| 518264eb60 | |||
| 6ab7aecbe4 | |||
| 9df74cc5b7 | |||
| d67bce1109 | |||
| 323790ee66 | |||
| f19d8ac4f0 | |||
| ceff569583 | |||
| 9675b40033 | |||
| 9e1ac525f1 | |||
| 081540c274 | |||
| 0a9dc8d8f5 | |||
| 7b0db3c399 | |||
| 42a6ba1ef6 | |||
| 13d9ce4385 | |||
| 99d7742c21 | |||
| 70068d4942 | |||
| 283fee001b | |||
| 2ed55d77be | |||
| 4aacef331e | |||
| e11918b6cf | |||
| 90e3d95b16 | |||
| 91bdb0dab1 |
@@ -1,58 +1,27 @@
|
||||
# config/machine.yaml
|
||||
machine_name: "Local_Test_Env"
|
||||
root_dir: "." # <--- 请修改这里为你的实际路径
|
||||
|
||||
# 当前使用的计算系统配置名
|
||||
current_system: "interactive_gpu"
|
||||
paths:
|
||||
# 必须填写入完整的绝对路径,确保 Python 无论在哪里执行都能找到它
|
||||
gpumdkit: "/cluster/home/koko125/tool/GPUMDkit/gpumdkit.sh"
|
||||
|
||||
systems:
|
||||
# --- 配置 1: 交互式 GPU 环境 (当前使用) ---
|
||||
# 场景: 你已经用 srun/tmux 申请到了资源,直接运行命令即可
|
||||
interactive_gpu:
|
||||
type: "local" # local 表示直接运行 subprocess,不提交 sbatch
|
||||
# 如果你的 nep 可执行文件也有特定路径,也可以放在这里
|
||||
nep_exe: "/cluster/home/koko125/tool/GPUMD/src/nep"
|
||||
# 脚本库位置
|
||||
script_dir: "config/scripts"
|
||||
|
||||
# 路径配置
|
||||
gpumdkit_root: "/cluster/home/koko125/tool/GPUMDkit"
|
||||
executors:
|
||||
# 1. 简单的本地命令 (如 NEP 训练)
|
||||
nep_local:
|
||||
type: "local"
|
||||
cmd: "nep"
|
||||
|
||||
tools:
|
||||
# 1. GPUMD 配置
|
||||
gpumd:
|
||||
command: "gpumd"
|
||||
# 运行前需要 source 的环境脚本
|
||||
env_setup: ""
|
||||
gpu_id: 0
|
||||
# 2. 复杂的本地脚本 (如 GPUMD)
|
||||
gpumd:
|
||||
type: "local"
|
||||
cmd: "gpumd" # 对应 config/scripts/gpumd.sh
|
||||
|
||||
# 2. NEP 配置 (同上)
|
||||
nep:
|
||||
command: "nep"
|
||||
env_setup: ""
|
||||
gpu_id: 0
|
||||
gpumdkit:
|
||||
# 假设是 GPU 版本,可能不需要 mpirun 或者只需要少量核
|
||||
command: "gpumdkit.sh"
|
||||
env_setup: ""
|
||||
# 即使是 local 模式,有时也需要指定并行度
|
||||
n_procs: 1
|
||||
# 3. VASP (GPU 版) 配置
|
||||
vasp:
|
||||
# 假设是 GPU 版本,可能不需要 mpirun 或者只需要少量核
|
||||
command: "mpirun -np 1 vasp_std"
|
||||
env_setup: ""
|
||||
# 即使是 local 模式,有时也需要指定并行度
|
||||
n_procs: 1
|
||||
|
||||
# --- 配置 2: VASP CPU 集群模式 (预留,未来使用) ---
|
||||
# 场景: 需要生成 submit.slurm 并 sbatch 提交
|
||||
slurm_cpu_cluster:
|
||||
type: "slurm"
|
||||
|
||||
gpumdkit_root: "/cluster/home/koko125/tool/GPUMDkit"
|
||||
|
||||
tools:
|
||||
vasp:
|
||||
command: "mpirun -np 4 vasp_std"
|
||||
env_setup: "module load vasp/6.3-cpu"
|
||||
|
||||
# Slurm 头部参数
|
||||
slurm_header:
|
||||
partition: "cpu_long"
|
||||
ntasks_per_node: 64
|
||||
time: "24:00:00"
|
||||
# 3. Slurm 提交测试 (VASP CPU)
|
||||
vasp_gpu:
|
||||
type: "local"
|
||||
cmd: "mpirun -np 1 vasp_std"
|
||||
@@ -1,44 +1,100 @@
|
||||
# config/param.yaml
|
||||
# param.yaml
|
||||
|
||||
# --- 1. 流程控制 ---
|
||||
stages_def:
|
||||
p: "preheat"
|
||||
m: "md"
|
||||
s: "select"
|
||||
d: "scf"
|
||||
t: "train"
|
||||
pr: "predict"
|
||||
o: "output"
|
||||
project: "LYC_model1"
|
||||
|
||||
# 默认流程
|
||||
default_workflow: ["p", "m", "s", "d", "t", "pr"]
|
||||
# 1. 初始文件定义 (对应 data/ 目录)
|
||||
files:
|
||||
poscar: "model1.vasp"
|
||||
potcar: "POTCAR"
|
||||
initial_pot: "nep89.txt" # 第一轮 MD 用的势函数
|
||||
label: "Li Y Cl"
|
||||
|
||||
# 自定义调度
|
||||
schedule:
|
||||
1: ["p", "m", "s", "d", "t", "o"]
|
||||
# 2. 迭代流程控制
|
||||
iterations:
|
||||
# --- 第一轮 ---
|
||||
- id: 0
|
||||
steps:
|
||||
# Step 1: MD (预热 + 采样)
|
||||
# 逻辑:会把 nep.txt (来自 initial_pot) 和 model.xyz 准备好
|
||||
- name: "00.md"
|
||||
sub_tasks:
|
||||
# 你提到可能有预热,也可能有加工,这里支持串行执行
|
||||
- template_sub: "preheat" # 使用 template/00.md/preheat/run.in
|
||||
- template_sub: "production" # 使用 template/00.md/production/run.in
|
||||
executor: "gpumd" # 对应 machine.yaml
|
||||
|
||||
# --- 2. 容错与通知 ---
|
||||
control:
|
||||
max_retries: 3
|
||||
check_interval: 60
|
||||
# Step 2: 筛选
|
||||
- name: "01.select"
|
||||
method: "random"
|
||||
params: [90, 120]
|
||||
|
||||
notification:
|
||||
enable_log: true
|
||||
log_file: "./logs/sys_runtime.log"
|
||||
enable_hook: true
|
||||
hook_script: "python ./hooks/send_alert.py"
|
||||
alert_events: ["fail", "finish"]
|
||||
# Step 3: SCF (VASP)
|
||||
# 逻辑:cp template/02.scf/INCAR; check KPOINTS; cp data/POTCAR
|
||||
- name: "02.scf"
|
||||
executor: "vasp_std" # 对应 machine.yaml (可能调用 vasp_std.sh)
|
||||
|
||||
# --- 3. 各模块具体的物理/算法参数 ---
|
||||
params:
|
||||
preheat:
|
||||
template_file: "run_ramp.in"
|
||||
# Step 4: 训练
|
||||
# 逻辑:cp template/03.train/nep.in
|
||||
- name: "03.train"
|
||||
executor: "nep_local"
|
||||
- id: 1
|
||||
steps:
|
||||
# Step 1: MD (预热 + 采样)
|
||||
# 逻辑:会把 nep.txt (来自 initial_pot) 和 model.xyz 准备好
|
||||
- name: "00.md"
|
||||
sub_tasks:
|
||||
# 你提到可能有预热,也可能有加工,这里支持串行执行
|
||||
- template_sub: "preheat" # 使用 template/00.md/preheat/run.in
|
||||
- template_sub: "production" # 使用 template/00.md/production/run.in
|
||||
executor: "gpumd" # 对应 machine.yaml
|
||||
|
||||
select:
|
||||
target_min: 60
|
||||
target_max: 120
|
||||
init_threshold: 0.01
|
||||
# Step 2: 筛选
|
||||
- name: "01.select"
|
||||
method: "distance"
|
||||
params: [90, 120]
|
||||
|
||||
scf:
|
||||
# 比如指定用 machine.yaml 里的哪个 tool 配置
|
||||
tool_key: "vasp"
|
||||
# Step 3: SCF (VASP)
|
||||
# 逻辑:cp template/02.scf/INCAR; check KPOINTS; cp data/POTCAR
|
||||
- name: "02.scf"
|
||||
executor: "vasp_std" # 对应 machine.yaml (可能调用 vasp_std.sh)
|
||||
|
||||
# Step 4: 训练
|
||||
# 逻辑:cp template/03.train/nep.in
|
||||
- name: "03.train"
|
||||
executor: "nep_local"
|
||||
# --- 第二轮 ---
|
||||
- id: 2
|
||||
steps:
|
||||
- name: "00.md"
|
||||
sub_tasks:
|
||||
- template_sub: "preheat"
|
||||
- template_sub: "production" # 第二轮可能只需要 sampling
|
||||
# 注意:这一轮的 nep.txt 会自动指向 iter_00/03.train/nep.txt
|
||||
|
||||
- name: "01.select"
|
||||
method: "distance"
|
||||
params: [0.01, 60, 90]
|
||||
|
||||
- name: "02.scf"
|
||||
executor: "vasp_std"
|
||||
|
||||
- name: "03.train"
|
||||
executor: "nep_local"
|
||||
- name: "04.predict"
|
||||
# [新增] 自定义模型文件 (位于 data/ 目录下),不填则使用当前训练结果
|
||||
# custom_nep: "nep_final_best.txt"
|
||||
|
||||
# [新增] 自定义预测结构 (位于 data/ 目录下),不填则使用 00.md 的结果
|
||||
# 注意:这里填写 .vasp 文件,程序会自动转化为 model.xyz
|
||||
custom_poscar: "model1_supercell.vasp"
|
||||
|
||||
conditions:
|
||||
- {T: 375, time: "15ns"}
|
||||
- { T: 400, time: "5ns" }
|
||||
- { T: 425, time: "2ns" }
|
||||
- { T: 450, time: "1ns" }
|
||||
- { T: 500, time: "1ns" }
|
||||
- { T: 600, time: "1ns" }
|
||||
- { T: 700, time: "1ns" }
|
||||
- { T: 800, time: "1ns" }
|
||||
- { T: 900, time: "1ns" }
|
||||
@@ -1,16 +0,0 @@
|
||||
# config/system.yaml
|
||||
project_name: "LiYCl_Transport_v1"
|
||||
|
||||
# 物理体系定义
|
||||
system:
|
||||
elements: ["Li", "Y", "Cl"]
|
||||
|
||||
# 初始结构 (VASP格式)
|
||||
initial_structure: "./initial_data/LiYCl.vasp"
|
||||
|
||||
# 初始势函数 (第一轮 preheat 使用)
|
||||
# 如果是第一轮,使用此通用势;后续轮次自动使用上一轮训练结果
|
||||
initial_potential: "./initial_data/nep89.txt"
|
||||
|
||||
# 晶格常数或扩胞设置 (可选,视具体模块逻辑而定)
|
||||
supercell: [1, 1, 1]
|
||||
58
data/POSCAR
Normal file
58
data/POSCAR
Normal file
@@ -0,0 +1,58 @@
|
||||
Li20 Ge2 P4 S24
|
||||
1.0
|
||||
8.5899509999999992 0.0000000000000000 0.0000000000000000
|
||||
-0.0386059999999990 8.8794570000000004 0.0000000000000000
|
||||
-0.1455200000000040 -0.4498699999999980 12.9663439999999994
|
||||
Li Ge P S
|
||||
20 2 4 24
|
||||
direct
|
||||
0.9887309999999990 0.5170910000000000 0.9517369999999991 Li+
|
||||
0.9644940000000000 0.5171220000000000 0.4263230000000000 Li+
|
||||
0.5105600000000000 0.9648409999999999 0.5489520000000000 Li+
|
||||
0.4963619999999990 0.9963219999999990 0.0506500000000000 Li+
|
||||
0.2598129999999990 0.2893130000000000 0.1575810000000000 Li+
|
||||
0.7437819999999991 0.7471070000000000 0.2819570000000000 Li+
|
||||
0.2159610000000000 0.8178780000000000 0.7535529999999990 Li+
|
||||
0.2296369999999990 0.7569920000000000 0.2519280000000000 Li+
|
||||
0.7540580000000000 0.2201050000000000 0.3515260000000000 Li+
|
||||
0.4620470000000000 0.5160800000000001 0.7507739999999991 Li+
|
||||
0.2296480000000000 0.2281340000000000 0.3660700000000000 Li+
|
||||
0.7342460000000000 0.2563749999999990 0.8572679999999990 Li+
|
||||
0.2450540000000000 0.7611289999999999 0.0057880000000000 Li+
|
||||
0.7414010000000000 0.2628560000000000 0.1314030000000000 Li+
|
||||
0.7641690000000000 0.7282580000000000 0.6894540000000000 Li+
|
||||
0.2214960000000000 0.2588350000000000 0.9194279999999990 Li+
|
||||
0.7481260000000000 0.7611089999999990 0.0084029999999990 Li+
|
||||
0.2441960000000000 0.7196549999999990 0.4849360000000000 Li+
|
||||
0.7834100000000001 0.2145880000000000 0.6254649999999990 Li+
|
||||
0.2411910000000000 0.2118050000000000 0.6496820000000000 Li+
|
||||
0.9968669999999999 0.4879730000000000 0.1831110000000000 Ge4+
|
||||
0.5088400000000001 0.9907500000000001 0.8156200000000000 Ge4+
|
||||
0.0083590000000000 0.4831760000000000 0.6844359999999990 P4+
|
||||
0.9926389999999990 0.9650139999999990 0.5018440000000000 P5+
|
||||
0.4924000000000000 0.5156300000000000 0.9930840000000000 P5+
|
||||
0.4878150000000000 0.0091810000000000 0.3105009999999990 P5+
|
||||
0.5174609999999999 0.2009880000000000 0.7214800000000000 S-
|
||||
0.9886940000000000 0.1764359999999990 0.4438430000000000 S2-
|
||||
0.0046020000000000 0.7939309999999991 0.3896550000000000 S2-
|
||||
0.2956610000000000 0.5308579999999991 0.9012580000000000 S2-
|
||||
0.6842570000000000 0.5303240000000000 0.8996850000000000 S2-
|
||||
0.4955310000000000 0.6948430000000000 0.0971559999999990 S2-
|
||||
0.4869330000000000 0.3101589999999990 0.0593990000000000 S2-
|
||||
0.1957789999999990 0.9476460000000000 0.5883450000000000 S2-
|
||||
0.8016430000000000 0.9458259999999999 0.5899210000000000 S2-
|
||||
0.0068860000000000 0.2956850000000000 0.0654530000000000 S2-
|
||||
0.0007660000000000 0.7092330000000000 0.1038340000000000 S2-
|
||||
0.2123620000000000 0.4863000000000000 0.6010390000000000 S2-
|
||||
0.8150440000000000 0.4910100000000001 0.5895600000000001 S2-
|
||||
0.4848800000000000 0.8005220000000000 0.3751230000000000 S2-
|
||||
0.4957740000000000 0.1787590000000000 0.4224449999999990 S2-
|
||||
0.2848890000000000 0.9890740000000000 0.9054629999999990 S2-
|
||||
0.7137549999999990 0.9990030000000000 0.9215460000000000 S2-
|
||||
0.9930950000000000 0.6631359999999999 0.7876190000000000 S2-
|
||||
0.0030850000000000 0.2815040000000000 0.7604740000000000 S2-
|
||||
0.7792519999999999 0.4661789999999990 0.2732200000000000 S2-
|
||||
0.2011889999999990 0.4785850000000000 0.2915830000000000 S2-
|
||||
0.4924739999999990 0.7926540000000000 0.7030090000000000 S2-
|
||||
0.6858759999999990 0.0159360000000000 0.2214040000000000 S2-
|
||||
0.2922079999999990 0.0311870000000000 0.2192110000000000 S2-
|
||||
7301
data/POTCAR
Normal file
7301
data/POTCAR
Normal file
File diff suppressed because it is too large
Load Diff
45
main.py
45
main.py
@@ -1,32 +1,33 @@
|
||||
# main.py
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from nep_auto.driver import NEPDriver
|
||||
from nep_auto.utils.logger import setup_logger
|
||||
from src.utils import setup_logger
|
||||
from src.workflow import Workflow
|
||||
|
||||
|
||||
def main():
|
||||
# 1. 初始化全局日志
|
||||
logger = setup_logger("logs/sys_runtime.log")
|
||||
logger.info("========================================")
|
||||
logger.info("🚀 NEP Automation Framework Starting...")
|
||||
logger.info("========================================")
|
||||
root_dir = os.getcwd()
|
||||
|
||||
# 1. 初始化日志
|
||||
# 既然 workspace 还没创建,先放到根目录,Workflow 初始化后再放到 workspace 也可以
|
||||
# 这里简单起见放在根目录
|
||||
setup_logger(root_dir)
|
||||
|
||||
# 2. 检查基本文件是否存在
|
||||
required_dirs = ['config', 'data', 'template']
|
||||
for d in required_dirs:
|
||||
if not os.path.exists(os.path.join(root_dir, d)):
|
||||
print(f"Error: Missing directory '{d}'. Please check file structure.")
|
||||
sys.exit(1)
|
||||
|
||||
# 3. 启动工作流
|
||||
try:
|
||||
# 2. 初始化驱动器 (加载配置,恢复状态)
|
||||
driver = NEPDriver()
|
||||
|
||||
# 3. 启动主循环
|
||||
driver.run()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.warning("⚠️ 用户手动中断程序 (KeyboardInterrupt)")
|
||||
sys.exit(0)
|
||||
app = Workflow(root_dir)
|
||||
app.run()
|
||||
except Exception as e:
|
||||
logger.error(f"❌ 程序发生严重崩溃: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
# 这里可以加入发送崩溃通知的逻辑
|
||||
sys.exit(1)
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
print(f"Critical Error: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
import yaml
|
||||
import time
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from nep_auto.status_manager import StatusManager
|
||||
|
||||
|
||||
class NEPDriver:
|
||||
def __init__(self):
|
||||
self.logger = logging.getLogger("NEP_Auto")
|
||||
self.root = Path(".")
|
||||
|
||||
# 1. 加载所有配置
|
||||
self.config_sys = self._load_yaml("config/system.yaml")
|
||||
self.config_param = self._load_yaml("config/param.yaml")
|
||||
# 【新增】加载 machine 配置
|
||||
self.config_machine = self._load_yaml("config/machine.yaml")
|
||||
|
||||
self.logger.info(f"项目名称: {self.config_sys.get('project_name')}")
|
||||
self.logger.info(f"计算环境: {self.config_machine.get('current_system')}")
|
||||
|
||||
# 2. 初始化状态管理器
|
||||
self.status = StatusManager(self.root / "workspace")
|
||||
|
||||
def _load_yaml(self, path):
|
||||
if not Path(path).exists():
|
||||
raise FileNotFoundError(f"配置文件缺失: {path}")
|
||||
with open(path, 'r') as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
def run(self):
|
||||
"""主循环"""
|
||||
self.logger.info("✅ 驱动器初始化完成,准备进入主循环...")
|
||||
|
||||
# 获取当前轮次
|
||||
current_iter = self.status.get_current_iter()
|
||||
self.logger.info(f"当前进度: iter_{current_iter:03d}")
|
||||
|
||||
# 暂时只打印一次就退出,用于测试环境
|
||||
self.logger.info("测试阶段:环境检查通过。等待模块代码实现...")
|
||||
# while True: ... (后续我们将在这里实现调度逻辑)
|
||||
@@ -1,77 +0,0 @@
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from nep_auto.utils.runner import CommandRunner
|
||||
|
||||
|
||||
class BaseModule:
|
||||
def __init__(self, driver, iter_id):
|
||||
"""
|
||||
:param driver: NEPDriver 实例,包含所有配置
|
||||
:param iter_id: 当前轮次 (int)
|
||||
"""
|
||||
self.driver = driver
|
||||
self.config_sys = driver.config_sys
|
||||
self.config_param = driver.config_param
|
||||
self.machine_config = driver.config_machine['systems'][driver.config_machine['current_system']]
|
||||
|
||||
self.iter_id = iter_id
|
||||
self.iter_name = f"iter_{iter_id:03d}"
|
||||
self.logger = logging.getLogger("NEP_Auto")
|
||||
|
||||
# 初始化运行器
|
||||
self.runner = CommandRunner(self.machine_config)
|
||||
|
||||
# 定义路径
|
||||
self.root = Path(driver.root) / "workspace"
|
||||
self.iter_dir = self.root / self.iter_name
|
||||
self.output_dir = self.iter_dir / "05.output" # 公共输出区
|
||||
|
||||
def get_work_dir(self):
|
||||
"""需由子类实现:返回当前模块的具体工作目录"""
|
||||
raise NotImplementedError
|
||||
|
||||
def initialize(self):
|
||||
"""通用初始化:创建目录,复制通用文件"""
|
||||
work_dir = self.get_work_dir()
|
||||
if not work_dir.exists():
|
||||
work_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.logger.debug(f"📁 Created dir: {work_dir}")
|
||||
|
||||
# 确保公共输出目录存在
|
||||
if not self.output_dir.exists():
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def run(self):
|
||||
"""核心逻辑入口,子类必须实现"""
|
||||
raise NotImplementedError
|
||||
|
||||
def check_done(self):
|
||||
"""检查任务是否完成,子类必须实现"""
|
||||
raise NotImplementedError
|
||||
|
||||
# --- 通用工具方法 ---
|
||||
|
||||
def copy_template(self, template_name, target_name=None):
|
||||
"""从 template 目录复制文件"""
|
||||
if target_name is None:
|
||||
target_name = template_name
|
||||
|
||||
# 根据模块类型寻找模板目录 (需要在子类定义 self.template_subdir)
|
||||
src = Path("template") / getattr(self, "template_subdir", "common") / template_name
|
||||
dst = self.get_work_dir() / target_name
|
||||
|
||||
if src.exists():
|
||||
shutil.copy(src, dst)
|
||||
# self.logger.debug(f"📄 Copied {template_name} -> {dst}")
|
||||
else:
|
||||
self.logger.warning(f"⚠️ Template not found: {src}")
|
||||
|
||||
def link_file(self, src_path, dst_name):
|
||||
"""创建软链接"""
|
||||
src = Path(src_path).resolve()
|
||||
dst = self.get_work_dir() / dst_name
|
||||
if dst.exists():
|
||||
dst.unlink()
|
||||
os.symlink(src, dst)
|
||||
@@ -1,113 +0,0 @@
|
||||
import shutil
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from .base_module import BaseModule
|
||||
|
||||
|
||||
class PreheatModule(BaseModule):
|
||||
def __init__(self, driver, iter_id):
|
||||
super().__init__(driver, iter_id)
|
||||
self.template_subdir = "00_md"
|
||||
|
||||
def get_work_dir(self):
|
||||
return self.iter_dir / "00.md" / "preheat"
|
||||
|
||||
def initialize(self):
|
||||
super().initialize() # 创建目录
|
||||
work_dir = self.get_work_dir()
|
||||
|
||||
# 1. 准备 run.in (从配置读取模板名)
|
||||
template_name = self.config_param['params']['preheat'].get('template_file', 'run.in')
|
||||
self.copy_template(template_name, "run.in")
|
||||
|
||||
# 2. 准备 nep.in (GPUMD 运行必需,虽然内容可能很简单)
|
||||
self.copy_template("nep.in")
|
||||
|
||||
# 3. 准备 nep.txt (势函数)
|
||||
self._prepare_potential()
|
||||
|
||||
# 4. 准备 model.xyz (结构)
|
||||
self._prepare_structure()
|
||||
|
||||
def _prepare_potential(self):
|
||||
"""准备势函数文件 nep.txt"""
|
||||
dst = self.get_work_dir() / "nep.txt"
|
||||
|
||||
if self.iter_id == 1:
|
||||
# 第一轮:使用 system.yaml 里定义的初始势
|
||||
init_pot = Path(self.config_sys['system']['initial_potential'])
|
||||
if not init_pot.exists():
|
||||
raise FileNotFoundError(f"Initial potential not found: {init_pot}")
|
||||
shutil.copy(init_pot, dst)
|
||||
self.logger.info(f" -> Copied initial potential: {init_pot.name}")
|
||||
else:
|
||||
# 后续轮次:使用上一轮训练结果
|
||||
prev_iter = f"iter_{self.iter_id - 1:03d}"
|
||||
prev_train_dir = self.root / prev_iter / "03.train"
|
||||
src = prev_train_dir / "nep.txt"
|
||||
|
||||
if not src.exists():
|
||||
raise FileNotFoundError(f"Previous potential not found: {src}")
|
||||
shutil.copy(src, dst)
|
||||
self.logger.info(f" -> Copied potential from {prev_iter}")
|
||||
|
||||
def _prepare_structure(self):
|
||||
"""准备 model.xyz"""
|
||||
work_dir = self.get_work_dir()
|
||||
|
||||
# 目前逻辑:Preheat 总是从初始结构开始(或者你可以改为从上一轮的 dump 中取)
|
||||
# 这里演示从 VASP 文件转换
|
||||
vasp_path = Path(self.config_sys['system']['initial_structure'])
|
||||
if not vasp_path.exists():
|
||||
raise FileNotFoundError(f"Structure file not found: {vasp_path}")
|
||||
|
||||
# 复制到工作目录
|
||||
local_vasp = work_dir / vasp_path.name
|
||||
shutil.copy(vasp_path, local_vasp)
|
||||
|
||||
# 调用 gpumdkit.sh -addlabel 进行转换
|
||||
# 命令格式: gpumdkit.sh -addlabel file.vasp Li Y Cl
|
||||
elements = " ".join(self.config_sys['system']['elements'])
|
||||
|
||||
self.logger.info(" -> Converting VASP to model.xyz...")
|
||||
|
||||
# 使用 runner 调用 gpumdkit (必须在 machine.yaml 里定义了 'gpumdkit')
|
||||
# 注意:gpumdkit.sh 可能不输出 model.xyz 而是输出 file.xyz,需要确认
|
||||
# 假设输出为 model.xyz
|
||||
cmd_args = f"-addlabel {local_vasp.name} {elements}"
|
||||
|
||||
self.runner.run("gpumdkit", cwd=work_dir, extra_args=cmd_args)
|
||||
|
||||
# 检查是否生成成功
|
||||
if not (work_dir / "model.xyz").exists():
|
||||
# 有时候 gpumdkit 生成的文件名可能是 LiYCl.xyz,需要重命名为 model.xyz
|
||||
# 这里做一个容错检查
|
||||
expected_name = local_vasp.stem + ".xyz" # e.g., LiYCl.xyz
|
||||
if (work_dir / expected_name).exists():
|
||||
shutil.move(work_dir / expected_name, work_dir / "model.xyz")
|
||||
else:
|
||||
raise RuntimeError("Failed to generate model.xyz from gpumdkit")
|
||||
|
||||
def run(self):
|
||||
"""执行 GPUMD"""
|
||||
work_dir = self.get_work_dir()
|
||||
|
||||
# 检查是否已经跑完 (简单的锁文件机制)
|
||||
if (work_dir / "thermo.out").exists():
|
||||
self.logger.info(f" -> Pre-check: thermo.out exists, skipping preheat.")
|
||||
# 这里可以加更复杂的检查,比如步数是否足够
|
||||
return
|
||||
|
||||
self.logger.info(f"🔥 Running Preheat in {self.iter_name}")
|
||||
self.initialize()
|
||||
|
||||
# 调用 GPUMD
|
||||
# GPUMD 没有参数,直接运行
|
||||
self.runner.run("gpumd", cwd=work_dir)
|
||||
|
||||
self.logger.info(" -> Preheat finished.")
|
||||
|
||||
def check_done(self):
|
||||
# 简单检查 thermo.out 是否存在且非空
|
||||
f = self.get_work_dir() / "thermo.out"
|
||||
return f.exists() and f.stat().st_size > 0
|
||||
@@ -1,116 +0,0 @@
|
||||
import shutil
|
||||
import glob
|
||||
from pathlib import Path
|
||||
from nep_auto.modules.base_module import BaseModule
|
||||
|
||||
|
||||
class MDModule(BaseModule):
|
||||
def __init__(self, driver, iter_id):
|
||||
super().__init__(driver, iter_id)
|
||||
self.template_subdir = "00_md"
|
||||
# 预热目录 (输入源)
|
||||
self.preheat_dir = self.iter_dir / "00.md" / "preheat"
|
||||
# MD 目录 (工作区)
|
||||
self.work_dir = self.iter_dir / "00.md" / "md"
|
||||
|
||||
def get_work_dir(self):
|
||||
return self.work_dir
|
||||
|
||||
def run(self):
|
||||
self.logger.info(f"🌪️ [MD] Starting Sampling Phase Iter {self.iter_id}...")
|
||||
self.initialize()
|
||||
|
||||
# ----------------------------------------
|
||||
# 1. 从预热轨迹中采样 (dump.xyz -> sampled_structures.xyz)
|
||||
# ----------------------------------------
|
||||
preheat_dump = self.preheat_dir / "dump.xyz"
|
||||
if not preheat_dump.exists():
|
||||
raise FileNotFoundError(f"Preheat dump not found: {preheat_dump}")
|
||||
|
||||
# 调用 sample_structures.py
|
||||
# 假设参数: input_file method number
|
||||
kit_root = self.driver.config_param['env']['gpumdkit_root']
|
||||
script = f"{kit_root}/Scripts/sample_structures/sample_structures.py"
|
||||
|
||||
# 复制 dump 到当前目录以便处理
|
||||
local_dump = self.work_dir / "preheat_dump.xyz"
|
||||
shutil.copy(preheat_dump, local_dump)
|
||||
|
||||
self.logger.info(" -> Sampling structures from preheat trajectory...")
|
||||
# 按照你的描述: sample_structures.py dump.xyz uniform 4
|
||||
# 这里 "4" 可以放到 param.yaml 里配置,暂时写死或读取默认
|
||||
self.runner.run(
|
||||
"python_script", # 这里可以用 local runner 直接跑 python
|
||||
cwd=self.work_dir,
|
||||
extra_args=f"{script} preheat_dump.xyz uniform 4"
|
||||
)
|
||||
|
||||
# 产物通常叫 sampled_structures.xyz,我们需要把它作为后续 MD 的起始结构
|
||||
# 但注意:GPUMD MD 通常读取 model.xyz 或者 restart。
|
||||
# 如果你的 run.in 里写的是 load_xyz sampled_structures.xyz,那就没问题。
|
||||
# 如果不是,通常做法是把 sampled_structures.xyz 切分成多个文件夹。
|
||||
|
||||
# --- 修正逻辑:根据你的描述 "生成 sample_1-4 文件夹" ---
|
||||
# 我们遍历 template/00_md/md_run_*.in
|
||||
tpl_path = Path("template") / self.template_subdir
|
||||
run_templates = sorted(list(tpl_path.glob("md_run_*.in")))
|
||||
|
||||
if not run_templates:
|
||||
self.logger.warning(f"⚠️ No 'md_run_*.in' found in {tpl_path}, looking for 'run.in'...")
|
||||
run_templates = list(tpl_path.glob("run.in"))
|
||||
|
||||
sub_tasks = []
|
||||
nep_source = self.preheat_dir / "nep.txt" # 沿用预热阶段的势函数
|
||||
|
||||
for idx, tpl in enumerate(run_templates, start=1):
|
||||
task_name = f"sample_{idx}"
|
||||
task_dir = self.work_dir / task_name
|
||||
task_dir.mkdir(exist_ok=True)
|
||||
sub_tasks.append(task_dir)
|
||||
|
||||
# 1. 复制 run.in
|
||||
shutil.copy(tpl, task_dir / "run.in")
|
||||
|
||||
# 2. 复制 nep.txt
|
||||
shutil.copy(nep_source, task_dir / "nep.txt")
|
||||
|
||||
# 3. 复制结构 (假设所有 sample 都从预热的最后一帧或 sampled_structures 开始)
|
||||
# 这里简化处理:复制 model.xyz (初始结构) 或者 使用 preheat 的最后状态
|
||||
# 根据你的流程,通常需要把 sampled_structures.xyz 里的某一帧放进去
|
||||
# 或者 GPUMD 支持直接读取 exyz。
|
||||
# 这里我们假设 run.in 里配置好了读取方式,我们只负责给文件。
|
||||
if (self.preheat_dir / "model.xyz").exists():
|
||||
shutil.copy(self.preheat_dir / "model.xyz", task_dir / "model.xyz")
|
||||
|
||||
# ----------------------------------------
|
||||
# 2. 执行所有 Sample 任务
|
||||
# ----------------------------------------
|
||||
self.logger.info(f" -> Submitting {len(sub_tasks)} MD tasks...")
|
||||
|
||||
for task_dir in sub_tasks:
|
||||
self.logger.info(f" -> Running {task_dir.name}...")
|
||||
self.runner.run("gpumd", cwd=task_dir)
|
||||
|
||||
# ----------------------------------------
|
||||
# 3. 合并结果
|
||||
# ----------------------------------------
|
||||
self.logger.info(" -> Merging dump files...")
|
||||
# cat sample_*/dump.xyz >> dump.xyz
|
||||
# 使用 python 实现 cat 以跨平台安全
|
||||
target_dump = self.work_dir / "dump.xyz"
|
||||
with open(target_dump, 'wb') as outfile:
|
||||
for task_dir in sub_tasks:
|
||||
src = task_dir / "dump.xyz"
|
||||
if src.exists():
|
||||
with open(src, 'rb') as infile:
|
||||
shutil.copyfileobj(infile, outfile)
|
||||
else:
|
||||
self.logger.warning(f"⚠️ {task_dir.name} generated no dump.xyz")
|
||||
|
||||
self.check_done()
|
||||
|
||||
def check_done(self):
|
||||
if (self.work_dir / "dump.xyz").exists():
|
||||
self.logger.info("✅ MD Sampling finished.")
|
||||
return True
|
||||
raise RuntimeError("MD failed: dump.xyz not created.")
|
||||
@@ -1,148 +0,0 @@
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from nep_auto.modules.base_module import BaseModule
|
||||
|
||||
|
||||
class SelectModule(BaseModule):
|
||||
def __init__(self, driver, iter_id):
|
||||
super().__init__(driver, iter_id)
|
||||
self.work_dir = self.iter_dir / "01.select"
|
||||
self.md_dir = self.iter_dir / "00.md" / "md"
|
||||
|
||||
def get_work_dir(self):
|
||||
return self.work_dir
|
||||
|
||||
def get_frame_count(self, xyz_file):
|
||||
"""读取 xyz 文件帧数 (通过 grep 'Lattice' 计数)"""
|
||||
if not xyz_file.exists():
|
||||
return 0
|
||||
try:
|
||||
# 使用 grep -c 更快,避免 python 读取大文件内存溢出
|
||||
result = subprocess.run(
|
||||
f"grep -c 'Lattice' {xyz_file}",
|
||||
shell=True, stdout=subprocess.PIPE, text=True
|
||||
)
|
||||
return int(result.stdout.strip())
|
||||
except:
|
||||
return 0
|
||||
|
||||
def run(self):
|
||||
self.logger.info(f"🔍 [Select] Starting Active Learning Selection Iter {self.iter_id}...")
|
||||
self.initialize()
|
||||
|
||||
# ----------------------------------------
|
||||
# 1. 准备必要文件
|
||||
# ----------------------------------------
|
||||
# A. 待筛选数据 (从 MD 结果拿)
|
||||
src_dump = self.md_dir / "dump.xyz"
|
||||
if not src_dump.exists():
|
||||
raise FileNotFoundError(f"MD dump missing: {src_dump}")
|
||||
shutil.copy(src_dump, self.work_dir / "dump.xyz")
|
||||
|
||||
# B. 势函数 (从 MD 结果拿)
|
||||
shutil.copy(self.md_dir / "nep.txt", self.work_dir / "nep.txt")
|
||||
|
||||
# C. 历史训练集 (用于对比)
|
||||
# 逻辑:如果是第一轮,我们需要一个初始的 train.xyz (即使是空的或者是 model.xyz)
|
||||
# gpumdkit 需要这个文件存在
|
||||
target_train_xyz = self.work_dir / "train.xyz"
|
||||
|
||||
if self.iter_id == 1:
|
||||
# 尝试从 data 目录拿初始训练集,如果没有,可以用 model.xyz 充数
|
||||
init_train = self.root / "00.data" / "train.xyz"
|
||||
if init_train.exists():
|
||||
shutil.copy(init_train, target_train_xyz)
|
||||
else:
|
||||
# 如果实在没有,把初始结构当做 train.xyz,避免脚本报错
|
||||
self.logger.warning("No initial train.xyz found, using model.xyz as placeholder.")
|
||||
shutil.copy(self.md_dir / "model.xyz", target_train_xyz)
|
||||
else:
|
||||
# 使用上一轮累积的训练集
|
||||
prev_train = self.root / f"iter_{self.iter_id - 1:03d}" / "03.train" / "train.xyz"
|
||||
if prev_train.exists():
|
||||
shutil.copy(prev_train, target_train_xyz)
|
||||
else:
|
||||
raise FileNotFoundError(f"Previous train.xyz missing: {prev_train}")
|
||||
|
||||
# ----------------------------------------
|
||||
# 2. 循环筛选 (调整阈值)
|
||||
# ----------------------------------------
|
||||
cfg = self.config_param['params']['select']
|
||||
target_min = cfg.get('target_min', 60)
|
||||
target_max = cfg.get('target_max', 120)
|
||||
threshold = cfg.get('init_threshold', 0.01)
|
||||
|
||||
max_attempts = 10
|
||||
attempt = 0
|
||||
|
||||
# gpumdkit 命令 (假设 machine.yaml 里配好了 tool 叫 'gpumdkit')
|
||||
# 如果是 local 模式,runner.run 实际上是执行 command。
|
||||
# 但这里我们需要特殊的 input pipe,runner 的通用接口可能不够用。
|
||||
# 既然我们明确是 local 环境且用 pipe,直接用 subprocess 最稳。
|
||||
gpumdkit_cmd = self.machine_config['tools']['gpumdkit']['command'] # e.g. "gpumdkit.sh"
|
||||
|
||||
while attempt < max_attempts:
|
||||
self.logger.info(f" -> Attempt {attempt + 1}: Threshold = {threshold:.5f}")
|
||||
|
||||
# 构造输入流字符串
|
||||
# 对应你的流程: 203 -> file names -> 1 (distance mode) -> threshold
|
||||
input_str = f"203\ndump.xyz train.xyz nep.txt\n1\n{threshold}\n"
|
||||
|
||||
# 构造完整命令: echo -e "..." | gpumdkit.sh
|
||||
# 注意:python 的 input 参数直接传给 stdin,不需要用 echo |
|
||||
|
||||
try:
|
||||
self.logger.debug(f" Input string: {repr(input_str)}")
|
||||
|
||||
process = subprocess.run(
|
||||
gpumdkit_cmd,
|
||||
input=input_str,
|
||||
cwd=self.work_dir,
|
||||
shell=True,
|
||||
executable="/bin/bash",
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
|
||||
# 记录输出以便 debug
|
||||
# self.logger.debug(process.stdout)
|
||||
|
||||
if process.returncode != 0:
|
||||
self.logger.error(f"gpumdkit execution failed: {process.stderr}")
|
||||
raise RuntimeError("gpumdkit failed")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Execution error: {e}")
|
||||
raise
|
||||
|
||||
# 检查 selected.xyz
|
||||
selected_file = self.work_dir / "selected.xyz"
|
||||
count = self.get_frame_count(selected_file)
|
||||
self.logger.info(f" -> Selected {count} structures.")
|
||||
|
||||
if target_min <= count <= target_max:
|
||||
self.logger.info(f"✅ Selection success! ({count} frames)")
|
||||
break
|
||||
elif count < target_min:
|
||||
self.logger.info(" -> Too few, lowering threshold (x0.8)...")
|
||||
threshold *= 0.8
|
||||
else:
|
||||
self.logger.info(" -> Too many, raising threshold (x1.2)...")
|
||||
threshold *= 1.2
|
||||
|
||||
# 稍微清理一下生成的中间文件,防止下次干扰?
|
||||
# selected.xyz 会被下次覆盖,所以不删也行。
|
||||
|
||||
attempt += 1
|
||||
|
||||
if attempt >= max_attempts:
|
||||
self.logger.warning("⚠️ Max attempts reached. Proceeding with current best.")
|
||||
|
||||
self.check_done()
|
||||
|
||||
def check_done(self):
|
||||
if (self.work_dir / "selected.xyz").exists():
|
||||
return True
|
||||
raise RuntimeError("Selection failed: selected.xyz not found")
|
||||
@@ -1,167 +0,0 @@
|
||||
import shutil
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
from nep_auto.modules.base_module import BaseModule
|
||||
|
||||
|
||||
class SCFModule(BaseModule):
    """DFT (VASP) labelling stage for one iteration.

    Splits the selected structures into per-task folders with gpumdkit,
    distributes INCAR/KPOINTS/POTCAR, runs the SCF jobs serially, and
    collects the OUTCARs into NEP-dataset.xyz for the training stage.
    """

    def __init__(self, driver, iter_id):
        super().__init__(driver, iter_id)
        self.template_subdir = "02_scf"
        self.work_dir = self.iter_dir / "02.scf"
        self.select_dir = self.iter_dir / "01.select"

    def get_work_dir(self):
        return self.work_dir

    def run(self):
        """Execute the full SCF stage: split -> prepare -> run -> collect."""
        self.logger.info(f"⚛️ [SCF] Starting DFT Calculation Iter {self.iter_id}...")
        self.initialize()

        prefix = "task"  # gpumdkit creates task_1, task_2, ...
        self._split_structures(prefix)
        fp_dir = self._prepare_dft_inputs()
        task_dirs = self._distribute_and_run(prefix, fp_dir)
        self._collect_results(task_dirs)
        self.check_done()

    def _split_structures(self, prefix):
        """Copy selected.xyz in and split it via gpumdkit interactive option 301."""
        src_xyz = self.select_dir / "selected.xyz"
        if not src_xyz.exists():
            raise FileNotFoundError("selected.xyz missing from select module")
        shutil.copy(src_xyz, self.work_dir / "selected.xyz")

        # Feed the interactive menu: "301" then the folder prefix.
        input_str = f"301\n{prefix}\n"
        gpumdkit_cmd = self.machine_config['tools']['gpumdkit']['command']

        self.logger.info(" -> Splitting structures using gpumdkit...")
        try:
            subprocess.run(
                gpumdkit_cmd,
                input=input_str,
                cwd=self.work_dir,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as e:
            self.logger.error(f"gpumdkit splitting failed: {e.stderr}")
            raise

    def _prepare_dft_inputs(self):
        """Stage INCAR/KPOINTS/POTCAR templates into the fp/ staging folder."""
        fp_dir = self.work_dir / "fp"
        # Some gpumdkit versions do not create fp/ themselves.
        fp_dir.mkdir(exist_ok=True)

        self.logger.info(" -> preparing INCAR/KPOINTS/POTCAR...")
        # One loop instead of three copy-pasted copy_template/shutil.copy pairs.
        for fname in ("INCAR", "KPOINTS", "POTCAR"):
            self.copy_template(fname, target_name=None)
            shutil.copy(self.work_dir / fname, fp_dir / fname)
        return fp_dir

    def _distribute_and_run(self, prefix, fp_dir):
        """Fan out DFT inputs to each task folder and run VASP serially.

        :return: the list of task directories that were processed
        """
        # Filter to directories up front: the glob can also match stray files
        # (e.g. task_1.log), and the old code ran VASP inside those paths.
        task_dirs = [d for d in sorted(self.work_dir.glob(f"{prefix}_*")) if d.is_dir()]
        if not task_dirs:
            raise RuntimeError(f"No {prefix}_* folders generated!")

        self.logger.info(f" -> Found {len(task_dirs)} tasks. Distributing input files...")
        common_files = ["INCAR", "KPOINTS", "POTCAR"]
        for t_dir in task_dirs:
            for f in common_files:
                shutil.copy(fp_dir / f, t_dir / f)

        # Serial execution; parallelism policy belongs to machine.yaml later.
        self.logger.info(" -> Running VASP jobs...")
        success_count = 0
        for t_dir in task_dirs:
            self.logger.info(f" -> Running {t_dir.name}...")
            try:
                self.runner.run("vasp", cwd=t_dir)
                if (t_dir / "OUTCAR").exists():  # crude success heuristic
                    success_count += 1
            except Exception as e:
                self.logger.error(f"Job {t_dir.name} failed: {e}")

        self.logger.info(f" -> Finished. Success: {success_count}/{len(task_dirs)}")
        return task_dirs

    def _collect_results(self, task_dirs):
        """Merge all OUTCARs into NEP-dataset.xyz (gpumdkit, ASE as fallback)."""
        self.logger.info(" -> Converting OUTCARs to NEP-dataset.xyz...")
        gpumdkit_cmd = self.machine_config['tools']['gpumdkit']['command']
        try:
            subprocess.run(
                f"{gpumdkit_cmd} -out2xyz .",
                cwd=self.work_dir,
                shell=True,
                executable="/bin/bash",
                check=True
            )
            # gpumdkit's output name varies by version; normalize it.
            for f in ("model.xyz", "movie.xyz", "out.xyz"):
                if (self.work_dir / f).exists():
                    shutil.move(self.work_dir / f, self.work_dir / "NEP-dataset.xyz")
                    break
        except subprocess.CalledProcessError:
            self.logger.warning("gpumdkit -out2xyz failed, falling back to ASE...")
            # Fallback: read each task's OUTCAR directly with ASE.
            from ase.io import read, write
            all_atoms = []
            for t_dir in task_dirs:
                try:
                    all_atoms.append(read(t_dir / "OUTCAR", format="vasp-outcar"))
                except Exception:
                    # A failed job simply contributes no frame; was a bare
                    # except that also swallowed KeyboardInterrupt.
                    pass
            if all_atoms:
                write(self.work_dir / "NEP-dataset.xyz", all_atoms, format="extxyz")

    def check_done(self):
        """Raise unless the collected dataset exists."""
        if (self.work_dir / "NEP-dataset.xyz").exists():
            return True
        raise RuntimeError("SCF failed: NEP-dataset.xyz not generated")
|
||||
@@ -1,69 +0,0 @@
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from nep_auto.modules.base_module import BaseModule
|
||||
|
||||
|
||||
class TrainModule(BaseModule):
    """NEP training stage: assembles train.xyz from the previous dataset plus
    this iteration's freshly labelled SCF data, then launches NEP."""

    def __init__(self, driver, iter_id):
        super().__init__(driver, iter_id)
        self.template_subdir = "03_train"
        self.work_dir = self.iter_dir / "03.train"

    def get_work_dir(self):
        return self.work_dir

    def run(self):
        self.logger.info(f"🧠 [Train] Starting Training Iter {self.iter_id}...")
        self.initialize()

        merged_path = self.work_dir / "train.xyz"

        # Gather dataset sources in order: prior training set (seed data on
        # the first iteration), then this iteration's new SCF results.
        dataset_paths = []
        if self.iter_id == 1:
            seed = self.root / "00.data" / "train.xyz"
            if seed.exists():
                dataset_paths.append(seed)
        else:
            previous = self.root / f"iter_{self.iter_id - 1:03d}" / "03.train" / "train.xyz"
            if previous.exists():
                dataset_paths.append(previous)

        fresh = self.iter_dir / "02.scf" / "NEP-dataset.xyz"
        if not fresh.exists():
            raise FileNotFoundError("New training data (NEP-dataset.xyz) missing!")
        dataset_paths.append(fresh)

        # Byte-level concatenation of the extxyz files.
        self.logger.info(f" -> Merging {len(dataset_paths)} datasets into train.xyz...")
        with open(merged_path, 'wb') as merged:
            for part in dataset_paths:
                with open(part, 'rb') as chunk:
                    shutil.copyfileobj(chunk, merged)

        # Training hyperparameters come from the template library.
        self.copy_template("nep.in")

        self.logger.info(" -> Running NEP training...")
        self.runner.run("nep", cwd=self.work_dir)

        self.check_done()

    def check_done(self):
        # nep.txt is the trained potential; presence marks success.
        # NOTE(review): loss.out convergence is not checked here — confirm
        # whether that is acceptable for the pipeline.
        if not (self.work_dir / "nep.txt").exists():
            raise RuntimeError("Training failed: nep.txt not generated")
        self.logger.info("✅ Training finished.")
        return True
|
||||
@@ -1,27 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class StatusManager:
    """Persists simple workflow progress to <workspace>/status.json."""

    def __init__(self, workspace_path):
        self.workspace = Path(workspace_path)
        self.status_file = self.workspace / "status.json"

        # Bootstrap the workspace directory and an initial status file.
        if not self.workspace.exists():
            self.workspace.mkdir(parents=True)
        if not self.status_file.exists():
            self._save_status({"current_iter": 1, "stages": {}})

    def _save_status(self, data):
        # Indented JSON keeps the file hand-editable.
        with open(self.status_file, 'w') as handle:
            json.dump(data, handle, indent=4)

    def get_current_iter(self):
        """Return the recorded iteration number, defaulting to 1 when the
        file is missing or lacks the key."""
        if not self.status_file.exists():
            return 1
        with open(self.status_file, 'r') as handle:
            return json.load(handle).get("current_iter", 1)
|
||||
@@ -1,33 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def setup_logger(log_file="logs/runtime.log"):
    """Create (or return) the shared "NEP_Auto" logger.

    Logs INFO and above to both ``log_file`` and stdout.  Calling this more
    than once returns the already-configured logger instead of stacking
    duplicate handlers.

    :param log_file: target log path; parent directories are created on demand
    :return: the configured ``logging.Logger``
    """
    # Only create the parent directory when the path actually has one:
    # os.makedirs("") raises FileNotFoundError for a bare filename, which the
    # old unconditional makedirs(dirname(...)) did.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger("NEP_Auto")
    logger.setLevel(logging.INFO)

    # Re-entrant: never attach duplicate handlers.
    if logger.handlers:
        return logger

    formatter = logging.Formatter(
        '[%(asctime)s] [%(levelname)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # File sink (append mode, UTF-8).
    fh = logging.FileHandler(log_file, mode='a', encoding='utf-8')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Console sink.
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    return logger
|
||||
@@ -1,74 +0,0 @@
|
||||
import subprocess
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
|
||||
|
||||
class CommandRunner:
    """Runs tools defined in machine.yaml, either directly in-process
    ("local" mode) or — as a stub — via a scheduler ("slurm" mode).
    """

    def __init__(self, machine_config):
        """
        :param machine_config: the mapping under 'systems' -> 'current_system'
            in config/machine.yaml
        """
        self.config = machine_config
        self.logger = logging.getLogger("NEP_Auto")
        self.mode = self.config.get("type", "local")  # "local" or "slurm"

    def run(self, tool_name, cwd=".", wait=True, extra_args=""):
        """
        Core execution method.

        :param tool_name: key under 'tools' in machine.yaml (e.g. 'gpumd', 'vasp')
        :param cwd: working directory for the command
        :param wait: True blocks until the command finishes; False runs it in
            the background and returns the Popen handle
        :param extra_args: arguments appended after the command
        :return: True on success (wait=True), the Popen object (wait=False),
            or False in slurm mode
        :raises ValueError: when the tool is not configured
        :raises RuntimeError: when the command exits non-zero (wait=True)
        """
        # 1. Look up the tool's configuration.
        tool_conf = self.config.get("tools", {}).get(tool_name)
        if not tool_conf:
            self.logger.error(f"❌ 找不到工具配置: {tool_name}")
            raise ValueError(f"Tool {tool_name} not defined in machine.yaml")

        cmd = tool_conf.get("command")
        env_setup = tool_conf.get("env_setup", "")

        # 2. Assemble and run the command (local mode).
        if self.mode == "local":
            full_cmd = f"{cmd} {extra_args}"

            # Prepend the environment-setup script, if any, chained with '&&'.
            if env_setup:
                full_cmd = f"{env_setup} && {full_cmd}"

            self.logger.info(f"⚙️ [Local] Executing: {full_cmd}")
            self.logger.info(f" 📂 Workdir: {cwd}")

            try:
                # Run through bash so 'source' in env_setup works.
                process = subprocess.Popen(
                    full_cmd,
                    shell=True,
                    cwd=cwd,
                    executable="/bin/bash",
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True
                )

                if wait:
                    stdout, stderr = process.communicate()
                    if process.returncode != 0:
                        self.logger.error(f"❌ Execution failed (Code {process.returncode})")
                        self.logger.error(f"Stderr: {stderr}")
                        raise RuntimeError(f"Command failed: {full_cmd}")
                    return True
                else:
                    # NOTE(review): stdout/stderr are PIPEs that nothing reads
                    # in background mode — a chatty child can fill the pipe
                    # buffer and stall; confirm callers drain the streams.
                    return process  # Popen handle for external monitoring

            except Exception as e:
                self.logger.error(f"❌ Runner Error: {str(e)}")
                raise

        # 3. Slurm mode (reserved interface; submission not implemented yet).
        elif self.mode == "slurm":
            self.logger.warning("⚠️ Slurm mode not fully implemented yet.")
            # Future: generate an sbatch script here and submit it.
            return False
|
||||
129
src/machine.py
Normal file
129
src/machine.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# src/machine.py
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import logging
|
||||
import shutil
|
||||
|
||||
|
||||
class MachineManager:
    """Dispatches tasks to execution backends defined in config/machine.yaml.

    Each entry under ``executors`` describes either a direct local command or
    script (``type: local``) or a Slurm batch submission (``type: slurm``).
    """

    def __init__(self, machine_config_path):
        # Local import avoids a circular import with src.utils at module load.
        from src.utils import load_yaml
        self.config = load_yaml(machine_config_path)
        self.root_dir = self.config.get('root_dir', os.getcwd())
        self.script_dir = os.path.join(self.root_dir, self.config.get('script_dir', 'config/scripts'))

        logging.info(f"MachineManager initialized. Script dir: {self.script_dir}")

    def execute(self, executor_name, work_dir):
        """
        Unified execution entry point.

        :param executor_name: key defined in machine.yaml (e.g. gpumd, vasp_cpu)
        :param work_dir: directory in which the task runs (created if absent)
        :return: True on success, False on any failure
        """
        # .get() avoids a raw KeyError when 'executors' is missing entirely.
        executors = self.config.get('executors', {})
        if executor_name not in executors:
            logging.error(f"Executor '{executor_name}' not defined in machine.yaml")
            return False

        exec_conf = executors[executor_name]
        exec_type = exec_conf.get('type', 'local')

        os.makedirs(work_dir, exist_ok=True)

        logging.info(f"--- Task: {executor_name} | Type: {exec_type} ---")
        logging.info(f"Working Dir: {work_dir}")

        if exec_type == 'local':
            return self._run_local(exec_conf, work_dir)
        elif exec_type == 'slurm':
            return self._submit_slurm(exec_conf, work_dir, executor_name)
        logging.error(f"Unknown execution type: {exec_type}")
        return False

    def _run_local(self, conf, work_dir):
        """Run the task directly in this process via the shell."""
        # 1. A script file takes precedence over an inline command.
        if 'script' in conf:
            src_script = os.path.join(self.script_dir, conf['script'])
            if not os.path.exists(src_script):
                logging.error(f"Script not found: {src_script}")
                return False
            cmd = f"bash {src_script}"
        # 2. Otherwise fall back to a direct 'cmd' string.
        elif 'cmd' in conf:
            cmd = conf['cmd']
        else:
            logging.error("No 'script' or 'cmd' defined for local executor.")
            return False

        try:
            logging.info(f"Executing Local Command: {cmd}")
            subprocess.check_call(cmd, shell=True, cwd=work_dir)
            logging.info("Local execution success.")
            return True
        except subprocess.CalledProcessError as e:
            logging.error(f"Execution failed with error code {e.returncode}")
            return False

    def _submit_slurm(self, conf, work_dir, job_name):
        """Generate a Slurm submission script and (optionally) submit it."""
        script_name = conf.get('script')
        # Guard: a slurm executor without 'script' used to crash with a
        # TypeError inside os.path.join — fail cleanly instead.
        if not script_name:
            logging.error(f"Slurm executor '{job_name}' has no 'script' defined.")
            return False
        src_script = os.path.join(self.script_dir, script_name)
        if not os.path.exists(src_script):
            logging.error(f"Script not found: {src_script}")
            return False

        # 1. Read the user's payload script.
        with open(src_script, 'r') as f:
            user_script_content = f.read()

        # 2. Write the submission wrapper (.sub).
        sub_file = os.path.join(work_dir, "submit.sub")
        with open(sub_file, 'w') as f:
            f.write("#!/bin/bash\n")
            f.write(f"#SBATCH --job-name={job_name}\n")
            # Map optional yaml keys onto #SBATCH directives.
            if 'partition' in conf: f.write(f"#SBATCH --partition={conf['partition']}\n")
            if 'nodes' in conf: f.write(f"#SBATCH --nodes={conf['nodes']}\n")
            if 'ntasks' in conf: f.write(f"#SBATCH --ntasks={conf['ntasks']}\n")
            if 'time' in conf: f.write(f"#SBATCH --time={conf['time']}\n")
            if 'gpus' in conf: f.write(f"#SBATCH --gres=gpu:{conf['gpus']}\n")

            f.write("\n")
            f.write("cd $SLURM_SUBMIT_DIR\n")
            f.write("\n")
            f.write("# --- User Script Content ---\n")
            f.write(user_script_content)

        logging.info(f"Generated submission script: {sub_file}")

        # 3. Submit. test_mode (default True, matching the old hard-coded
        # TEST_MODE constant) only generates the .sub file; set
        # 'test_mode: false' on the executor in machine.yaml to really submit.
        if conf.get('test_mode', True):
            logging.info("[TEST_MODE] Simulated 'sbatch submit.sub'. Check the .sub file.")
            return True
        try:
            res = subprocess.check_output(f"sbatch {sub_file}", shell=True, cwd=work_dir)
            # Typical output: "Submitted batch job 123456"
            job_id = res.decode().strip().split()[-1]
            logging.info(f"Job submitted. ID: {job_id}")
            # TODO: poll wait_for_job(job_id) — next development phase.
            return True
        except subprocess.CalledProcessError as e:
            logging.error(f"Submission failed: {e}")
            return False
|
||||
61
src/state.py
Normal file
61
src/state.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# src/state.py
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class StateTracker:
    """Records which workflow tasks have completed, persisted as JSON."""

    def __init__(self, workspace_dir):
        self.state_file = os.path.join(workspace_dir, "workflow_status.json")
        self.history = []           # ordered records: [{'task': id, 'time': ts}, ...]
        self.completed_set = set()  # task ids, for O(1) is_done lookups
        self.load()

    def load(self):
        """Populate history/completed_set from disk, tolerating the legacy
        format (a bare list of task-id strings)."""
        if not os.path.exists(self.state_file):
            return
        try:
            with open(self.state_file, 'r') as handle:
                payload = json.load(handle)
            entries = payload.get("completed", [])
            if entries and isinstance(entries[0], str):
                # Legacy format stored plain strings; upgrade each entry
                # with a placeholder timestamp.
                entries = [{"task": name, "time": "Unknown"} for name in entries]
            self.history = entries
            # Rebuild the fast-lookup set from the ordered history.
            self.completed_set = {entry['task'] for entry in self.history}
        except Exception as exc:
            print(f"Warning: Failed to load state file: {exc}")
            self.history = []
            self.completed_set = set()

    def mark_done(self, task_id):
        """Record task completion (with timestamp) and persist immediately."""
        if task_id in self.completed_set:
            return  # keep history free of duplicates

        self.history.append({
            "task": task_id,
            "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        })
        self.completed_set.add(task_id)
        self.save()

    def is_done(self, task_id):
        """Return True when the task has already completed."""
        return task_id in self.completed_set

    def save(self):
        # indent=2 keeps the file readable; ensure_ascii=False keeps any
        # non-ASCII task ids legible on disk.
        with open(self.state_file, 'w') as handle:
            json.dump({"completed": self.history}, handle, indent=2, ensure_ascii=False)
|
||||
177
src/steps.py
Normal file
177
src/steps.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# src/steps.py
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
|
||||
class BaseStep:
    """Common scaffolding for workflow steps: a named working directory,
    access to the machine manager, and a file-staging helper."""

    def __init__(self, name, work_dir, machine_manager, config):
        self.name = name
        self.work_dir = work_dir
        self.machine = machine_manager
        self.config = config
        os.makedirs(self.work_dir, exist_ok=True)
        self.logger = logging.getLogger()

    def copy_file(self, src, dst_name=None):
        """Copy ``src`` into the step's work dir.

        :param dst_name: optional new basename; defaults to src's basename
        :return: the destination path on success, False when src is missing
        """
        if not os.path.exists(src):
            self.logger.error(f"[{self.name}] Source file missing: {src}")
            return False

        target = os.path.join(self.work_dir, dst_name or os.path.basename(src))
        shutil.copy(src, target)
        return target
|
||||
|
||||
|
||||
class MDStep(BaseStep):
    """Step 00.md: molecular-dynamics sampling with GPUMD."""

    def run(self, prev_nep_path, template_path):
        """Stage inputs and launch GPUMD; succeed only if dump.xyz appears.

        :param prev_nep_path: potential file from the previous iteration
        :param template_path: directory holding the run.in template
        :return: True when MD finished and produced dump.xyz, else False
        """
        self.logger.info(f"=== Running Step: {self.name} (MD) ===")

        # 1. The run is meaningless without a potential file.
        if not prev_nep_path:
            self.logger.error("No nep.txt provided for MD.")
            return False
        self.copy_file(prev_nep_path, "nep.txt")

        # 2. model.xyz is expected to be staged by an earlier init step;
        #    warn (but continue) when it is absent.
        if not os.path.exists(os.path.join(self.work_dir, "model.xyz")):
            self.logger.warning(f"[{self.name}] model.xyz not found in {self.work_dir}. Make sure Init step ran.")

        # 3. Stage run.in from the template library.
        self.copy_file(os.path.join(template_path, "run.in"), "run.in")

        # 4. Delegate the actual GPUMD invocation to the machine layer
        #    (the 'gpumd' executor defined in machine.yaml).
        ok = self.machine.execute("gpumd", self.work_dir)

        if ok and os.path.exists(os.path.join(self.work_dir, "dump.xyz")):
            self.logger.info(f"[{self.name}] MD finished. dump.xyz generated.")
            return True
        self.logger.error(f"[{self.name}] MD failed or dump.xyz missing.")
        return False
|
||||
|
||||
|
||||
class SelectStep(BaseStep):
    """Step 01.select: pick structures from the MD dump for DFT labelling.

    The real pipeline drives gpumdkit.sh (option 203, distance selection);
    the loop below currently uses a mock stand-in that emulates tuning the
    threshold until the count lands inside [target_min, target_max].
    """

    def run(self, dump_path, train_path, nep_path, method="distance", params=(0.01, 60, 120)):
        """
        :param dump_path: trajectory produced by the MD step
        :param train_path: current training set (for distance comparison)
        :param nep_path: current potential file
        :param method: selection criterion name (only "distance" is modelled)
        :param params: (initial_threshold, target_min, target_max); a tuple
            default replaces the old mutable-list default argument
        :return: True (also after non-convergence, which only logs a warning)
        """
        self.logger.info(f"=== Running Step: {self.name} (Smart Selection) ===")

        # Stage the three inputs into this step's work dir.
        self.copy_file(dump_path, "dump.xyz")
        self.copy_file(train_path, "train.xyz")
        self.copy_file(nep_path, "nep.txt")

        threshold = params[0]
        target_min, target_max = params[1], params[2]
        step_size = 0.001  # threshold adjustment per attempt

        # NOTE(review): in production this loop should call gpumdkit.sh, e.g.
        #   input_str = f"203\ndump.xyz train.xyz nep.txt\n1\n{threshold}\n"
        #   subprocess.run("gpumdkit.sh", input=input_str, cwd=self.work_dir, ...)
        # The code below is a mock so the framework can be exercised locally.
        for attempt in range(10):  # at most 10 tuning attempts
            self.logger.info(f"Selection attempt {attempt + 1}: Threshold={threshold:.4f}")

            # --- Mock selection start ---
            # Smaller thresholds select more frames.
            mock_count = int(100 / (threshold * 100))
            with open(os.path.join(self.work_dir, "selected.xyz"), 'w') as f:
                f.write(f"Mock selected {mock_count} frames")

            selected_count = mock_count
            self.logger.info(f"Found {selected_count} structures (Mock).")
            # --- Mock selection end ---

            if target_min <= selected_count <= target_max:
                self.logger.info(f"Selection Success! Final count: {selected_count}")
                return True
            elif selected_count < target_min:
                self.logger.info("Too few. Decreasing threshold.")
                threshold -= step_size
                # Keep the threshold strictly positive: a value of exactly 0
                # would divide by zero above (the old guard only caught '< 0').
                if threshold <= 0:
                    threshold = 0.001
            else:
                self.logger.info("Too many. Increasing threshold.")
                threshold += step_size

        self.logger.warning("Selection failed to converge. Using last result.")
        return True  # tolerate non-convergence for now
|
||||
|
||||
|
||||
class SCFStep(BaseStep):
    """Step 02.scf: stage VASP inputs and run the SCF calculation."""

    def run(self, template_path, potcar_path):
        """
        :param template_path: directory containing INCAR (required) and
            KPOINTS (optional) templates
        :param potcar_path: path to the POTCAR to use
        :return: result of the 'vasp_cpu' executor, or False if INCAR missing
        """
        self.logger.info(f"=== Running Step: {self.name} (SCF/VASP) ===")

        # 1. Pseudopotential.
        self.copy_file(potcar_path, "POTCAR")

        # 2. INCAR is mandatory — abort when the template is absent.
        if not self.copy_file(os.path.join(template_path, "INCAR"), "INCAR"):
            return False

        # 3. KPOINTS is optional.
        kpoints_src = os.path.join(template_path, "KPOINTS")
        if os.path.exists(kpoints_src):
            self.copy_file(kpoints_src, "KPOINTS")

        # 4. Run VASP via the machine layer.
        # NOTE(review): splitting selected.xyz into per-structure folders is
        # not handled here yet; this runs a single job in the work dir.
        return self.machine.execute("vasp_cpu", self.work_dir)
|
||||
|
||||
|
||||
class TrainStep(BaseStep):
    """Step 03.train: run NEP training on the accumulated dataset."""

    def run(self, template_path, new_train_data_path):
        """
        :param template_path: directory containing the nep.in template
        :param new_train_data_path: merged training set; when absent a mock
            placeholder is written so the framework can be smoke-tested
        :return: result of the 'nep_local' executor
        """
        self.logger.info(f"=== Running Step: {self.name} (Train) ===")

        # 1. Training hyperparameters.
        self.copy_file(os.path.join(template_path, "nep.in"), "nep.in")

        # 2. Training data (all sources are assumed concatenated upstream).
        if new_train_data_path and os.path.exists(new_train_data_path):
            self.copy_file(new_train_data_path, "train.xyz")
        else:
            # No real data supplied — write a placeholder for dry runs.
            with open(os.path.join(self.work_dir, "train.xyz"), 'w') as f:
                f.write("Mock training data")

        # 3. Launch NEP.
        return self.machine.execute("nep_local", self.work_dir)
|
||||
143
src/utils.py
Normal file
143
src/utils.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# src/utils.py
|
||||
import yaml
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def load_yaml(path):
    """Parse a YAML configuration file.

    Exits the process (status 1) when the file does not exist — a missing
    config is fatal for this tool.

    :param path: path to the YAML file
    :return: the parsed document (dict/list/scalar)
    """
    if not os.path.exists(path):
        logging.error(f"Config file not found: {path}")
        sys.exit(1)

    with open(path, 'r') as handle:
        return yaml.safe_load(handle)
|
||||
|
||||
|
||||
def setup_logger(work_dir, log_file="autonep.log"):
    """Configure the root logger to write to <work_dir>/<log_file> and stdout.

    Safe to call repeatedly: previous handlers are removed AND closed before
    being replaced — the old bare ``logger.handlers.clear()`` dropped open
    FileHandlers without closing them, leaking file descriptors.

    :param work_dir: directory for the log file (must already exist)
    :param log_file: log file name inside work_dir
    :return: the configured root logger
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Remove (and close) any existing handlers to avoid duplicates and
    # leaked file descriptors on reconfiguration.
    if logger.hasHandlers():
        for handler in list(logger.handlers):
            logger.removeHandler(handler)
            try:
                handler.close()
            except Exception:
                pass  # best effort — never fail logging setup on teardown

    # File handler: full timestamped format.
    file_handler = logging.FileHandler(os.path.join(work_dir, log_file))
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

    # Console handler: message only, to keep terminal output compact.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(console_handler)

    return logger
|
||||
|
||||
|
||||
# src/utils.py 中的 Notifier 类
|
||||
|
||||
class Notifier:
    """Pushes notifications through the local 'post' CLI, degrading to a
    log-only no-op when the command is unavailable."""

    def __init__(self, project_name="AutoNEP"):
        self.project_name = project_name

    def send(self, title, message, priority=3):
        """Send a notification via the local 'post' command.

        :param title: short title; the project name is prefixed automatically
        :param message: notification body
        :param priority: 0-3: Low/Log, 4-7: Med/Done, 8+: High/Err
        """
        import subprocess

        tagged_title = f"[{self.project_name}] {title}"

        # Shape: post -a task -t "Title" -m "Msg" -p Priority
        args = [
            "post",
            "-a", "task",  # fixed 'task' channel
            "-t", tagged_title,
            "-m", message,
            "-p", str(priority)
        ]

        try:
            # Suppress the tool's own output so it cannot interleave with
            # the workflow's log stream.
            subprocess.run(args, check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # Keep a local record of every notification attempt.
            logging.info(f"[[Notification Sent]] P{priority} | {title}: {message}")
        except FileNotFoundError:
            logging.warning("Command 'post' not found. Notification skipped.")
        except Exception as e:
            logging.warning(f"Failed to send notification: {e}")
|
||||
|
||||
# src/utils.py 添加在最后
|
||||
|
||||
def run_cmd_with_log(cmd, cwd, log_file="exec.log", input_str=None):
    """Run a shell command in ``cwd``, streaming stdout+stderr to a log file.

    :param cmd: shell command line
    :param cwd: working directory (the log file is created inside it)
    :param log_file: log file name, appended to across invocations
    :param input_str: optional text piped to the process's stdin
    :return: True when the command exited with status 0, False otherwise
    """
    import subprocess

    log_path = os.path.join(cwd, log_file)
    mode = 'a' if os.path.exists(log_path) else 'w'

    with open(log_path, mode) as log:
        # Header so consecutive invocations are distinguishable in the log.
        log.write(f"\n\n>>> Executing: {cmd}\n")
        log.write(f">>> Input: {repr(input_str)}\n")
        log.write("-" * 40 + "\n")
        log.flush()

        try:
            proc = subprocess.Popen(
                cmd,
                shell=True,
                cwd=cwd,
                stdin=subprocess.PIPE if input_str else None,
                stdout=log,                # stream stdout straight into the log
                stderr=subprocess.STDOUT,  # merge stderr into the same log
                text=True
            )

            # Feed stdin (if any) and block until completion.
            proc.communicate(input=input_str)

            log.write(f"\n>>> Finished with Return Code: {proc.returncode}\n")
            return proc.returncode == 0
        except Exception as exc:
            log.write(f"\n>>> Exception: {str(exc)}\n")
            return False
|
||||
|
||||
|
||||
def parse_time_to_steps(time_str, time_step_fs=1.0):
    """Convert a simulation-time string to a number of MD steps.

    Accepts a bare step count ('50000') or a value with a unit suffix:
    fs, ps, or ns (e.g. '5ns', '10ps').

    :param time_str: time specification (string or int)
    :param time_step_fs: MD timestep in femtoseconds
    :return: integer number of steps
    :raises ValueError: on unparsable input or an unknown unit
    """
    import re

    text = str(time_str)

    # A pure integer is already a step count.
    if text.isdigit():
        return int(text)

    # Split into magnitude and unit.
    matched = re.match(r"([\d\.]+)\s*([a-zA-Z]+)", text)
    if not matched:
        raise ValueError(f"Unknown time format: {time_str}")

    amount = float(matched.group(1))
    unit = matched.group(2).lower()

    # Normalize everything to femtoseconds.
    unit_to_fs = {'fs': 1.0, 'ps': 1000.0, 'ns': 1000.0 * 1000.0}
    if unit not in unit_to_fs:
        raise ValueError(f"Unsupported time unit: {unit}")

    return int(amount * unit_to_fs[unit] / time_step_fs)
|
||||
904
src/workflow.py
Normal file
904
src/workflow.py
Normal file
@@ -0,0 +1,904 @@
|
||||
# src/workflow.py
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
import subprocess
|
||||
import re
|
||||
from src.utils import load_yaml, run_cmd_with_log, parse_time_to_steps,Notifier
|
||||
from src.machine import MachineManager
|
||||
from src.state import StateTracker # 新增
|
||||
from src.steps import MDStep, SelectStep, SCFStep, TrainStep
|
||||
|
||||
|
||||
class Workflow:
|
||||
def __init__(self, root_dir):
    """Load configuration, set up workspace paths, and initialize machine
    access, state tracking and notifications.

    :param root_dir: project root containing config/, data/, template/
    """
    self.root_dir = root_dir
    self.param = load_yaml(os.path.join(root_dir, "config/param.yaml"))
    self.machine = MachineManager(os.path.join(root_dir, "config/machine.yaml"))

    self.workspace = os.path.join(root_dir, "workspace")
    self.data_dir = os.path.join(root_dir, "data")
    self.template_dir = os.path.join(root_dir, "template")

    self.logger = logging.getLogger()

    # Progress tracking lives inside the workspace directory.
    os.makedirs(self.workspace, exist_ok=True)
    self.tracker = StateTracker(self.workspace)

    # Single notifier tagged with the project name (the old code first
    # created a throwaway default-named Notifier and then overwrote it).
    project_name = self.param.get('project', 'AutoNEP')
    self.notifier = Notifier(project_name)
    self.notifier.send("Workflow Init", "AutoNEP framework initialized.", 1)

    # Rolling state threaded through the iterations.
    self.current_nep_pot = os.path.join(self.data_dir, self.param['files']['initial_pot'])
    self.current_train_set = os.path.join(self.workspace, "accumulated_train.xyz")
|
||||
|
||||
def run(self):
|
||||
self.logger.info(f"Workflow Started: {self.param['project']}")
|
||||
self.notifier.send("Workflow Start", f"Starting project: {self.param['project']}", 5)
|
||||
for iteration in self.param['iterations']:
|
||||
iter_id = iteration['id']
|
||||
iter_name = f"iter_{iter_id:02d}"
|
||||
iter_path = os.path.join(self.workspace, iter_name)
|
||||
self.logger.info(f"\n >>> Processing Iteration: {iter_id} <<<")
|
||||
os.makedirs(iter_path, exist_ok=True)
|
||||
|
||||
for step_conf in iteration['steps']:
|
||||
step_name = step_conf['name']
|
||||
|
||||
# ==========================
|
||||
# Step: 00.md
|
||||
# ==========================
|
||||
if step_name == "00.md":
|
||||
step_dir = os.path.join(iter_path, "00.md")
|
||||
task_id_init = f"{iter_name}.00.md.init"
|
||||
|
||||
if not os.path.exists(step_dir):
|
||||
os.makedirs(step_dir, exist_ok=True)
|
||||
|
||||
# 获取本轮是否定义了自定义 POSCAR
|
||||
custom_poscar = iteration.get('custom_poscar')
|
||||
|
||||
if not self.tracker.is_done(task_id_init):
|
||||
# === 情况 A: 用户指定了自定义 POSCAR (优先级最高) ===
|
||||
if custom_poscar:
|
||||
self.logger.info(f"Using Custom POSCAR for this iteration: {custom_poscar}")
|
||||
poscar_src = os.path.join(self.data_dir, custom_poscar)
|
||||
|
||||
if os.path.exists(poscar_src):
|
||||
# 复制并重命名为 config 中定义的标准名 (为了方便 gpumdkit 处理)
|
||||
std_poscar_name = self.param['files']['poscar']
|
||||
shutil.copy(poscar_src, os.path.join(step_dir, std_poscar_name))
|
||||
|
||||
# 调用 gpumdkit 转化
|
||||
atom_labels = self.param['files'].get('label', '')
|
||||
kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
|
||||
cmd = f"{kit_path} -addlabel {std_poscar_name} {atom_labels}"
|
||||
|
||||
if run_cmd_with_log(cmd, step_dir, "init.log"):
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_init}", 2)
|
||||
self.tracker.mark_done(task_id_init)
|
||||
else:
|
||||
self.logger.error("Custom POSCAR initialization failed.")
|
||||
return
|
||||
else:
|
||||
self.logger.error(f"Custom POSCAR not found in data dir: {poscar_src}")
|
||||
return
|
||||
|
||||
# === 情况 B: 第一轮且无自定义 (使用默认 POSCAR) ===
|
||||
elif iter_id == 0:
|
||||
poscar_name = self.param['files']['poscar']
|
||||
poscar_src = os.path.join(self.data_dir, poscar_name)
|
||||
|
||||
if os.path.exists(poscar_src):
|
||||
shutil.copy(poscar_src, os.path.join(step_dir, poscar_name))
|
||||
atom_labels = self.param['files'].get('label', '')
|
||||
kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
|
||||
cmd = f"{kit_path} -addlabel {poscar_name} {atom_labels}"
|
||||
|
||||
if run_cmd_with_log(cmd, step_dir, "init.log"):
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_init}", 2)
|
||||
self.tracker.mark_done(task_id_init)
|
||||
else:
|
||||
self.logger.error("Initialization failed.")
|
||||
return
|
||||
else:
|
||||
self.logger.error("Default POSCAR missing.")
|
||||
return
|
||||
|
||||
# === 情况 C: 后续轮次且无自定义 (继承上一轮) ===
|
||||
else:
|
||||
# 只有当 model.xyz 不存在时才去复制
|
||||
if not os.path.exists(os.path.join(step_dir, "model.xyz")):
|
||||
prev_iter_name = f"iter_{iter_id - 1:02d}"
|
||||
prev_model_src = os.path.join(self.workspace, prev_iter_name, "00.md", "model.xyz")
|
||||
|
||||
if os.path.exists(prev_model_src):
|
||||
self.logger.info(f"Copying model.xyz from {prev_iter_name}...")
|
||||
shutil.copy(prev_model_src, os.path.join(step_dir, "model.xyz"))
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_init}", 2)
|
||||
self.tracker.mark_done(task_id_init) # 标记完成
|
||||
else:
|
||||
self.logger.error(f"Previous model.xyz not found: {prev_model_src}")
|
||||
return
|
||||
else:
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_init}", 2)
|
||||
self.tracker.mark_done(task_id_init)
|
||||
# 确保 gpumdkit 路径可用
|
||||
kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
|
||||
|
||||
# === Sub-task 1: Preheat ===
|
||||
task_id_preheat = f"{iter_name}.00.md.preheat"
|
||||
preheat_dir = os.path.join(step_dir, "preheat")
|
||||
|
||||
if not self.tracker.is_done(task_id_preheat):
|
||||
self.logger.info(">>> Starting Preheat...")
|
||||
os.makedirs(preheat_dir, exist_ok=True)
|
||||
|
||||
# 准备文件
|
||||
shutil.copy(os.path.join(step_dir, "model.xyz"), os.path.join(preheat_dir, "model.xyz"))
|
||||
shutil.copy(self.current_nep_pot, os.path.join(preheat_dir, "nep.txt"))
|
||||
shutil.copy(os.path.join(self.template_dir, "00.md", "preheat", "run.in"),
|
||||
os.path.join(preheat_dir, "run.in"))
|
||||
|
||||
# A. 运行 GPUMD
|
||||
# 假设 gpumd 命令直接运行,无输入
|
||||
if not run_cmd_with_log("gpumd", preheat_dir, "step_exec.log"):
|
||||
self.logger.error("Preheat GPUMD failed.")
|
||||
return
|
||||
|
||||
# B. 运行 采样 (201)
|
||||
# [修正] 严格按照要求: "201\ndump.xyz uniform 4" (中间无额外换行)
|
||||
input_str_201 = "201\ndump.xyz uniform 4"
|
||||
self.logger.info(">>> Running Sampling (201)...")
|
||||
|
||||
if run_cmd_with_log(kit_path, preheat_dir, "step_exec.log", input_str=input_str_201):
|
||||
if os.path.exists(os.path.join(preheat_dir, "sampled_structures.xyz")):
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_preheat}", 2)
|
||||
self.tracker.mark_done(task_id_preheat)
|
||||
else:
|
||||
self.logger.error("sampled_structures.xyz not generated.")
|
||||
return
|
||||
else:
|
||||
self.logger.error("Sampling command failed.")
|
||||
return
|
||||
else:
|
||||
self.logger.info("Skipping Preheat (Already Done).")
|
||||
|
||||
# === Sub-task 2: Production ===
|
||||
task_id_prod = f"{iter_name}.00.md.production"
|
||||
prod_dir = os.path.join(step_dir, "production")
|
||||
|
||||
if not self.tracker.is_done(task_id_prod):
|
||||
self.logger.info(">>> Starting Production...")
|
||||
os.makedirs(prod_dir, exist_ok=True)
|
||||
|
||||
# 1. 准备基础文件到 production 根目录
|
||||
src_sample = os.path.abspath(os.path.join(preheat_dir, "sampled_structures.xyz"))
|
||||
dst_sample = os.path.join(prod_dir, "sampled_structures.xyz")
|
||||
if os.path.exists(dst_sample): os.remove(dst_sample)
|
||||
os.symlink(src_sample, dst_sample)
|
||||
|
||||
shutil.copy(self.current_nep_pot, os.path.join(prod_dir, "nep.txt"))
|
||||
shutil.copy(os.path.join(self.template_dir, "00.md", "production", "run.in"),
|
||||
os.path.join(prod_dir, "run.in"))
|
||||
|
||||
# 2. 运行 302 (生成 md 文件夹, sample_* 文件夹, presub.sh)
|
||||
# 302 通常会读取当前目录的 run.in 并在 md/ 下生成拆分后的 run_x.in
|
||||
input_str_302 = "302"
|
||||
if not run_cmd_with_log(kit_path, prod_dir, "step_exec.log", input_str=input_str_302):
|
||||
self.logger.error("302 command failed.")
|
||||
return
|
||||
|
||||
if not os.path.exists(os.path.join(prod_dir, "presub.sh")):
|
||||
self.logger.error("presub.sh not found.")
|
||||
return
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# [新增] 3. 补全文件:将 nep.txt 和 run.in 复制到 md 文件夹
|
||||
# ---------------------------------------------------------
|
||||
md_subdir = os.path.join(prod_dir, "md")
|
||||
if os.path.exists(md_subdir):
|
||||
self.logger.info("Copying nep.txt and run.in to 'md' folder...")
|
||||
shutil.copy(os.path.join(prod_dir, "nep.txt"), os.path.join(md_subdir, "nep.txt"))
|
||||
# 复制 run.in,虽然 302 可能已经生成了 run_1.in 等,但为了保险或用户习惯,我们也拷进去
|
||||
shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run.in"))
|
||||
shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_1.in"))
|
||||
shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_2.in"))
|
||||
shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_3.in"))
|
||||
shutil.copy(os.path.join(prod_dir, "run.in"), os.path.join(md_subdir, "run_4.in"))
|
||||
else:
|
||||
self.logger.error("'md' folder was not created by 302 command.")
|
||||
return
|
||||
|
||||
# 4. 运行 presub.sh
|
||||
os.chmod(os.path.join(prod_dir, "presub.sh"), 0o755)
|
||||
self.logger.info(">>> Executing presub.sh...")
|
||||
|
||||
if not run_cmd_with_log("./presub.sh", prod_dir, "step_exec.log"):
|
||||
self.logger.error("presub.sh execution failed.")
|
||||
return
|
||||
|
||||
# 5. 合并 dump
|
||||
self.logger.info("Merging dump files...")
|
||||
run_cmd_with_log("cat sample_*/dump.xyz > dump.xyz", prod_dir, "step_exec.log")
|
||||
|
||||
self.last_dump_path = os.path.join(prod_dir, "dump.xyz")
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_prod}", 2)
|
||||
self.tracker.mark_done(task_id_prod)
|
||||
else:
|
||||
self.logger.info("Skipping Production (Already Done).")
|
||||
self.last_dump_path = os.path.join(prod_dir, "dump.xyz")
|
||||
|
||||
# ==========================
|
||||
# Step: 01.select
|
||||
# ==========================
|
||||
elif step_name == "01.select":
|
||||
step_dir = os.path.join(iter_path, "01.select")
|
||||
task_id_select = f"{iter_name}.01.select"
|
||||
|
||||
if not self.tracker.is_done(task_id_select):
|
||||
method = step_conf.get('method', 'distance')
|
||||
self.logger.info(f"=== Step: 01.select ({method}) ===")
|
||||
os.makedirs(step_dir, exist_ok=True)
|
||||
|
||||
# Output 目录
|
||||
output_dir = os.path.join(iter_path, "05.output")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# 1. 准备 dump.xyz (软链接)
|
||||
dump_src = getattr(self, 'last_dump_path', None)
|
||||
if not dump_src:
|
||||
dump_src = os.path.join(iter_path, "00.md", "production", "dump.xyz")
|
||||
|
||||
if os.path.exists(dump_src):
|
||||
dst_dump = os.path.join(step_dir, "dump.xyz")
|
||||
if os.path.exists(dst_dump): os.remove(dst_dump)
|
||||
os.symlink(os.path.abspath(dump_src), dst_dump)
|
||||
else:
|
||||
self.logger.error(f"Source dump.xyz not found: {dump_src}")
|
||||
return
|
||||
|
||||
# 2. 准备 nep.txt
|
||||
shutil.copy(self.current_nep_pot, os.path.join(step_dir, "nep.txt"))
|
||||
|
||||
# 3. 准备 train.xyz
|
||||
if iter_id == 0:
|
||||
model_xyz_src = os.path.join(iter_path, "00.md", "model.xyz")
|
||||
if os.path.exists(model_xyz_src):
|
||||
shutil.copy(model_xyz_src, os.path.join(step_dir, "train.xyz"))
|
||||
else:
|
||||
self.logger.error("model.xyz not found for initial train.xyz")
|
||||
return
|
||||
else:
|
||||
if os.path.exists(self.current_train_set):
|
||||
shutil.copy(self.current_train_set, os.path.join(step_dir, "train.xyz"))
|
||||
else:
|
||||
self.logger.error(f"Previous train set missing: {self.current_train_set}")
|
||||
return
|
||||
|
||||
# 4. 执行筛选逻辑
|
||||
params = step_conf.get('params', [0.01, 60, 120])
|
||||
kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
|
||||
|
||||
# === 分支 A: 按距离筛选 (二分法 Binary Search) ===
|
||||
if method == "distance":
|
||||
target_min, target_max = params[1], params[2]
|
||||
|
||||
# 定义二分查找的初始区间
|
||||
# lower_bound = 0.0 (极其宽松,选所有)
|
||||
# upper_bound = 0.2 (通常足够大,甚至可以设大一点,视体系而定)
|
||||
# 我们取 param[0] 作为初始猜测,但搜索范围设宽一点
|
||||
lower_bound = 0.0
|
||||
upper_bound = 0.2
|
||||
|
||||
max_attempts = 15
|
||||
success = False
|
||||
|
||||
selection_log = []
|
||||
|
||||
# 用于记录“最佳失败结果” (如果最终没收敛,用这个)
|
||||
best_result = None
|
||||
min_dist_to_range = float('inf') # 距离目标区间的差距
|
||||
|
||||
self.logger.info(f"Targeting {target_min}-{target_max} structures using Binary Search.")
|
||||
|
||||
for attempt in range(max_attempts):
|
||||
current_attempt = attempt + 1
|
||||
|
||||
# 二分取值
|
||||
threshold = (lower_bound + upper_bound) / 2.0
|
||||
|
||||
self.logger.info(
|
||||
f"--- Attempt {current_attempt}: Threshold {threshold:.6f} (Range: {lower_bound:.4f}-{upper_bound:.4f}) ---")
|
||||
|
||||
input_str = f"203\ndump.xyz train.xyz nep.txt\n1\n{threshold}"
|
||||
|
||||
if not run_cmd_with_log(kit_path, step_dir, "select_exec.log", input_str=input_str):
|
||||
self.logger.error("gpumdkit execution failed.")
|
||||
break
|
||||
|
||||
# 检查数量
|
||||
count = 0
|
||||
if os.path.exists(os.path.join(step_dir, "selected.xyz")):
|
||||
try:
|
||||
count_out = subprocess.check_output('grep -c "Lat" selected.xyz', shell=True,
|
||||
cwd=step_dir)
|
||||
count = int(count_out.decode().strip())
|
||||
except:
|
||||
count = 0
|
||||
self.logger.info(f"Selected count: {count}")
|
||||
|
||||
# 记录日志
|
||||
status = "Fail"
|
||||
if target_min <= count <= target_max:
|
||||
status = "Success"
|
||||
|
||||
selection_log.append({
|
||||
"attempt": current_attempt,
|
||||
"threshold": threshold,
|
||||
"count": count,
|
||||
"result": status
|
||||
})
|
||||
|
||||
# 记录最佳结果 (距离 target_min 或 target_max 最近的)
|
||||
dist = 0
|
||||
if count < target_min:
|
||||
dist = target_min - count
|
||||
elif count > target_max:
|
||||
dist = count - target_max
|
||||
|
||||
if dist < min_dist_to_range:
|
||||
min_dist_to_range = dist
|
||||
# 备份当前的 selected.xyz 为 best_selected.xyz
|
||||
shutil.copy(os.path.join(step_dir, "selected.xyz"),
|
||||
os.path.join(step_dir, "best_selected.xyz"))
|
||||
best_result = {"threshold": threshold, "count": count}
|
||||
|
||||
# 逻辑判断
|
||||
if target_min <= count <= target_max:
|
||||
self.logger.info(f"Success! Count {count} is within range.")
|
||||
success = True
|
||||
break
|
||||
|
||||
# 二分调整逻辑:
|
||||
# 假设: 阈值越小(Loose) -> 选的越多; 阈值越大(Tight) -> 选的越少
|
||||
if count < target_min:
|
||||
# 选少了 -> 需要更宽松 -> 降低阈值 -> 往 [lower, current] 搜
|
||||
self.logger.info("Too few. Need Looser (Lower) threshold.")
|
||||
upper_bound = threshold
|
||||
else:
|
||||
# 选多了 -> 需要更严格 -> 提高阈值 -> 往 [current, upper] 搜
|
||||
self.logger.info("Too many. Need Tighter (Higher) threshold.")
|
||||
lower_bound = threshold
|
||||
|
||||
# 极小区间保护:如果上下界太接近,直接退出,避免死循环
|
||||
if (upper_bound - lower_bound) < 1e-6:
|
||||
self.logger.warning("Search interval too small. Stopping.")
|
||||
break
|
||||
|
||||
# 循环结束后的处理
|
||||
if not success and best_result:
|
||||
self.logger.warning(
|
||||
f"Could not strictly satisfy range. Using best result: {best_result['count']} (Thr={best_result['threshold']:.6f})")
|
||||
# 恢复最佳文件
|
||||
shutil.move(os.path.join(step_dir, "best_selected.xyz"),
|
||||
os.path.join(step_dir, "selected.xyz"))
|
||||
|
||||
# 清理临时文件
|
||||
if os.path.exists(os.path.join(step_dir, "best_selected.xyz")):
|
||||
os.remove(os.path.join(step_dir, "best_selected.xyz"))
|
||||
|
||||
# 写CSV日志
|
||||
csv_path = os.path.join(output_dir, "select_log.csv")
|
||||
try:
|
||||
with open(csv_path, 'w') as f:
|
||||
f.write("Attempt,Threshold,Count,Result\n")
|
||||
for entry in selection_log:
|
||||
f.write(
|
||||
f"{entry['attempt']},{entry['threshold']:.6f},{entry['count']},{entry['result']}\n")
|
||||
except:
|
||||
pass
|
||||
|
||||
# 归档图片
|
||||
src_png = os.path.join(step_dir, "select.png")
|
||||
if os.path.exists(src_png):
|
||||
shutil.copy(src_png, os.path.join(output_dir, "select.png"))
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_select}", 2)
|
||||
self.tracker.mark_done(task_id_select)
|
||||
|
||||
# === 分支 B: 随机筛选 (保持不变) ===
|
||||
elif method == "random":
|
||||
min_n, max_n = params[0], params[1]
|
||||
input_str = f"203\ndump.xyz train.xyz nep.txt\n2\n{min_n} {max_n}"
|
||||
self.logger.info(f"Random selection: {min_n}-{max_n}")
|
||||
if run_cmd_with_log(kit_path, step_dir, "select_exec.log", input_str=input_str):
|
||||
with open(os.path.join(output_dir, "select_log.csv"), 'w') as f:
|
||||
f.write("Method,Min,Max,Result\n")
|
||||
f.write(f"Random,{min_n},{max_n},Executed\n")
|
||||
src_png = os.path.join(step_dir, "select.png")
|
||||
if os.path.exists(src_png): shutil.copy(src_png, os.path.join(output_dir, "select.png"))
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_select}", 2)
|
||||
self.tracker.mark_done(task_id_select)
|
||||
else:
|
||||
self.logger.error("Random selection failed.")
|
||||
return
|
||||
else:
|
||||
self.logger.info("Skipping Select (Already Done).")
|
||||
# ==========================
|
||||
# Step: 02.scf (VASP Calculation)
|
||||
# ==========================
|
||||
elif step_name == "02.scf":
|
||||
step_dir = os.path.join(iter_path, "02.scf")
|
||||
task_id_scf = f"{iter_name}.02.scf"
|
||||
|
||||
if not self.tracker.is_done(task_id_scf):
|
||||
self.logger.info("=== Step: 02.scf (VASP) ===")
|
||||
os.makedirs(step_dir, exist_ok=True)
|
||||
|
||||
# 1. 准备 selected.xyz
|
||||
# 尝试从同轮次的 01.select 获取
|
||||
select_step_dir = os.path.join(iter_path, "01.select")
|
||||
src_selected = os.path.join(select_step_dir, "selected.xyz")
|
||||
if not os.path.exists(src_selected):
|
||||
self.logger.error(f"selected.xyz not found in {select_step_dir}")
|
||||
return
|
||||
|
||||
dst_selected = os.path.join(step_dir, "selected.xyz")
|
||||
if os.path.exists(dst_selected): os.remove(dst_selected)
|
||||
os.symlink(os.path.abspath(src_selected), dst_selected)
|
||||
|
||||
# 2. 运行 301 拆分结构
|
||||
# 命令: echo -e "301\niter" | gpumdkit.sh
|
||||
# 这会生成 iterX_1, iterX_2... 和 fp 文件夹
|
||||
kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
|
||||
input_str_301 = "301\niter" # 这里 "iter" 是文件夹前缀名,gpumdkit 会自动加数字
|
||||
|
||||
self.logger.info("Splitting structures (301)...")
|
||||
if not run_cmd_with_log(kit_path, step_dir, "scf_setup.log", input_str=input_str_301):
|
||||
self.logger.error("301 command failed.")
|
||||
return
|
||||
|
||||
# 3. 准备 VASP 输入文件到 'fp' 文件夹
|
||||
fp_dir = os.path.join(step_dir, "fp")
|
||||
if not os.path.exists(fp_dir):
|
||||
self.logger.error("'fp' directory was not created by 301.")
|
||||
return
|
||||
|
||||
self.logger.info("Distributing VASP inputs to 'fp' folder...")
|
||||
|
||||
# A. POTCAR (来自 Data)
|
||||
potcar_src = os.path.join(self.data_dir, self.param['files']['potcar'])
|
||||
if os.path.exists(potcar_src):
|
||||
shutil.copy(potcar_src, os.path.join(fp_dir, "POTCAR"))
|
||||
else:
|
||||
self.logger.error(f"POTCAR missing: {potcar_src}")
|
||||
return
|
||||
|
||||
# B. INCAR (来自 Template)
|
||||
incar_src = os.path.join(self.template_dir, "02.scf", "INCAR")
|
||||
if os.path.exists(incar_src):
|
||||
shutil.copy(incar_src, os.path.join(fp_dir, "INCAR"))
|
||||
else:
|
||||
self.logger.error(f"INCAR missing in template: {incar_src}")
|
||||
return
|
||||
|
||||
# C. KPOINTS (来自 Template, 可选)
|
||||
kpoints_src = os.path.join(self.template_dir, "02.scf", "KPOINTS")
|
||||
if os.path.exists(kpoints_src):
|
||||
shutil.copy(kpoints_src, os.path.join(fp_dir, "KPOINTS"))
|
||||
else:
|
||||
self.logger.info("KPOINTS not found in template, assuming KSPACING in INCAR.")
|
||||
|
||||
# D. [新增] vdw_kernel.bindat (可选 + 手动软链接)
|
||||
vdw_src = os.path.join(self.template_dir, "02.scf", "vdw_kernel.bindat")
|
||||
vdw_fp_path = os.path.join(fp_dir, "vdw_kernel.bindat")
|
||||
has_vdw = False
|
||||
|
||||
if os.path.exists(vdw_src):
|
||||
shutil.copy(vdw_src, vdw_fp_path)
|
||||
has_vdw = True
|
||||
self.logger.info("Found vdw_kernel.bindat, copied to fp folder.")
|
||||
|
||||
# E. [新增] 遍历所有子目录,补全 vdw_kernel.bindat 的软链接
|
||||
# gpumdkit 只会自动链接 fp 里的 INCAR/POTCAR/KPOINTS
|
||||
if has_vdw:
|
||||
self.logger.info("Manually creating symlinks for vdw_kernel.bindat...")
|
||||
# 获取 step_dir 下所有的 iterX_Y 文件夹
|
||||
for item in os.listdir(step_dir):
|
||||
item_path = os.path.join(step_dir, item)
|
||||
# 判断是否是拆分出来的子任务目录 (通常以 iter 开头)
|
||||
if os.path.isdir(item_path) and item.startswith("iter"):
|
||||
# 目标链接路径
|
||||
dst_link = os.path.join(item_path, "vdw_kernel.bindat")
|
||||
# 如果已存在(比如是死文件),先删除
|
||||
if os.path.exists(dst_link) or os.path.islink(dst_link):
|
||||
os.remove(dst_link)
|
||||
|
||||
# 创建指向 ../fp/vdw_kernel.bindat 的相对软链接
|
||||
# 这样即使文件夹移动,链接也大概率有效
|
||||
try:
|
||||
os.symlink(os.path.join("..", "fp", "vdw_kernel.bindat"), dst_link)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to symlink vdw in {item}: {e}")
|
||||
# 4. 生成并提交计算任务
|
||||
# 这里我们不理会 gpumdkit 生成的 presub.sh,而是根据 machine.yaml 生成自己的
|
||||
executor_name = step_conf.get('executor', 'vasp_gpu') # 默认用 cpu
|
||||
|
||||
# 获取执行命令 (例如 "mpirun -np 32 vasp_std")
|
||||
# 这里的逻辑需要调用 machine 模块的一个新功能:批量生成提交脚本
|
||||
# 但为了简化,我们在 Local 模式下生成一个遍历脚本
|
||||
|
||||
self.logger.info(f"Generating batch submission script for {executor_name}...")
|
||||
|
||||
# 读取 machine 配置里的命令
|
||||
exec_conf = self.machine.config['executors'].get(executor_name, {})
|
||||
vasp_cmd = exec_conf.get('cmd', 'mpirun -np 1 vasp_std') # 默认值
|
||||
|
||||
# 生成 run_vasp.sh
|
||||
run_script_path = os.path.join(step_dir, "run_vasp.sh")
|
||||
with open(run_script_path, 'w') as f:
|
||||
f.write("#!/bin/bash\n")
|
||||
# 遍历 iter* 目录
|
||||
f.write(f"for dir in iter*_*; do\n")
|
||||
f.write(f" if [ -d \"$dir\" ]; then\n")
|
||||
f.write(f" echo \"Running VASP in $dir ...\"\n")
|
||||
f.write(f" cd $dir\n")
|
||||
# 写入具体的 VASP 执行命令
|
||||
f.write(f" {vasp_cmd} > vasp.log 2>&1\n") # 重定向日志
|
||||
f.write(f" cd ..\n")
|
||||
f.write(f" fi\n")
|
||||
f.write(f"done\n")
|
||||
|
||||
os.chmod(run_script_path, 0o755)
|
||||
|
||||
# 执行 VASP 计算
|
||||
# 注意:如果是在 Slurm 上,这里应该提交 run_vasp.sh,并使用 Job ID 等待
|
||||
# 目前 Local 模式直接运行
|
||||
self.logger.info(">>> Executing VASP batch calculations (this may take time)...")
|
||||
if not run_cmd_with_log("./run_vasp.sh", step_dir, "scf_exec.log"):
|
||||
self.logger.error("VASP batch execution failed.")
|
||||
return
|
||||
|
||||
# 5. 结果收集 (out2xyz)
|
||||
self.logger.info("Collecting results (out2xyz)...")
|
||||
cmd_collect = f"{kit_path} -out2xyz ."
|
||||
if run_cmd_with_log(cmd_collect, step_dir, "scf_collect.log"):
|
||||
# 检查结果
|
||||
res_dir = os.path.join(step_dir, "NEPdataset-multiple_frames")
|
||||
res_file = os.path.join(res_dir, "NEP-dataset.xyz")
|
||||
|
||||
if os.path.exists(res_file):
|
||||
self.logger.info(f"VASP data collected: {res_file}")
|
||||
# 保存这个路径供 Train 使用
|
||||
self.new_data_chunk = res_file
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_scf}", 2)
|
||||
self.tracker.mark_done(task_id_scf)
|
||||
else:
|
||||
self.logger.error("NEP-dataset.xyz not found after collection.")
|
||||
else:
|
||||
self.logger.error("out2xyz failed.")
|
||||
|
||||
else:
|
||||
self.logger.info("Skipping SCF (Already Done).")
|
||||
# 即使跳过,也要尝试恢复 self.new_data_chunk 变量,防止 Train 找不到数据
|
||||
# 这里简单推断一下
|
||||
res_file = os.path.join(step_dir, "NEPdataset-multiple_frames", "NEP-dataset.xyz")
|
||||
if os.path.exists(res_file):
|
||||
self.new_data_chunk = res_file
|
||||
# ==========================
|
||||
# Step: 03.train (Training)
|
||||
# ==========================
|
||||
elif step_name == "03.train":
|
||||
step_dir = os.path.join(iter_path, "03.train")
|
||||
task_id_train = f"{iter_name}.03.train"
|
||||
|
||||
if not self.tracker.is_done(task_id_train):
|
||||
self.logger.info("=== Step: 03.train (NEP) ===")
|
||||
os.makedirs(step_dir, exist_ok=True)
|
||||
|
||||
# 1. 准备 train.xyz (合并数据)
|
||||
# 逻辑:Current Total Train = Previous Total Train + New Data Chunk
|
||||
self.logger.info("Merging training data...")
|
||||
|
||||
train_xyz_path = os.path.join(step_dir, "train.xyz")
|
||||
|
||||
# 打开目标文件准备写入
|
||||
with open(train_xyz_path, 'w') as outfile:
|
||||
# A. 写入旧数据 (如果存在)
|
||||
if os.path.exists(self.current_train_set):
|
||||
self.logger.info(f"Appending previous data: {self.current_train_set}")
|
||||
with open(self.current_train_set, 'r') as infile:
|
||||
shutil.copyfileobj(infile, outfile)
|
||||
|
||||
# B. [新增] 注入 data 中的额外数据 (仅在第一轮注入,防止重复)
|
||||
# 如果 extra_train_data 存在,且当前是第一轮 (或者 current_train_set 还没建立)
|
||||
extra_files = self.param['files'].get('extra_train_data', [])
|
||||
if iter_id == 0 and extra_files:
|
||||
self.logger.info(f"Injecting extra training data: {extra_files}")
|
||||
for xyz_file in extra_files:
|
||||
src_xyz = os.path.join(self.data_dir, xyz_file)
|
||||
if os.path.exists(src_xyz):
|
||||
self.logger.info(f" -> Appending {xyz_file}")
|
||||
with open(src_xyz, 'r') as infile:
|
||||
shutil.copyfileobj(infile, outfile)
|
||||
else:
|
||||
self.logger.warning(f"Extra data file not found: {src_xyz}")
|
||||
|
||||
# C. 写入新数据 (来自本轮 SCF)
|
||||
new_data = getattr(self, 'new_data_chunk', None)
|
||||
if not new_data:
|
||||
new_data = os.path.join(iter_path, "02.scf", "NEPdataset-multiple_frames",
|
||||
"NEP-dataset.xyz")
|
||||
|
||||
if new_data and os.path.exists(new_data):
|
||||
self.logger.info(f"Appending new data: {new_data}")
|
||||
with open(new_data, 'r') as infile:
|
||||
shutil.copyfileobj(infile, outfile)
|
||||
else:
|
||||
# 只有在既没有旧数据,又没有额外数据,也没有新数据时才报错
|
||||
has_data = (os.path.exists(self.current_train_set) or
|
||||
(iter_id == 0 and extra_files))
|
||||
if not has_data:
|
||||
self.logger.error("No training data available at all.")
|
||||
return
|
||||
elif not new_data or not os.path.exists(new_data):
|
||||
self.logger.warning("No new SCF data found. Training on existing/extra data only.")
|
||||
# 更新全局变量指向最新的 train.xyz,供下一轮使用
|
||||
self.current_train_set = train_xyz_path
|
||||
|
||||
# 2. 准备 nep.in
|
||||
template_nep_in = os.path.join(self.template_dir, "03.train", "nep.in")
|
||||
if os.path.exists(template_nep_in):
|
||||
shutil.copy(template_nep_in, os.path.join(step_dir, "nep.in"))
|
||||
else:
|
||||
self.logger.error(f"nep.in template missing: {template_nep_in}")
|
||||
return
|
||||
|
||||
# 3. 执行训练
|
||||
executor_name = step_conf.get('executor', 'nep_local')
|
||||
# 获取nep命令,比如 "nep" 或者 "/path/to/nep"
|
||||
# 注意:nep 命令通常不需要参数,它会自动读取 nep.in
|
||||
|
||||
self.logger.info(f"Starting NEP training using {executor_name}...")
|
||||
# 这里的 log 文件叫 train_exec.log
|
||||
if not run_cmd_with_log("nep", step_dir,
|
||||
"train_exec.log"): # 假设 cmd 是 nep,如果 machine.yaml 里有特殊定义请调整
|
||||
self.logger.error("NEP training failed.")
|
||||
return
|
||||
|
||||
# 检查是否生成了 nep.txt
|
||||
if os.path.exists(os.path.join(step_dir, "nep.txt")):
|
||||
self.logger.info("Training finished. nep.txt generated.")
|
||||
# 更新全局势函数路径,供下一轮 MD 使用
|
||||
self.current_nep_pot = os.path.join(step_dir, "nep.txt")
|
||||
else:
|
||||
self.logger.error("nep.txt not found after training.")
|
||||
return
|
||||
|
||||
# 4. 后处理:绘图与归档
|
||||
self.logger.info("Generating plots (gpumdkit.sh -plt train)...")
|
||||
kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
|
||||
cmd_plt = f"{kit_path} -plt train"
|
||||
|
||||
run_cmd_with_log(cmd_plt, step_dir, "plot.log")
|
||||
|
||||
# [修改] 创建 output 目录在当前 iter 内部
|
||||
output_dir = os.path.join(iter_path, "05.output")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# 移动 png 图片
|
||||
for file in os.listdir(step_dir):
|
||||
if file.endswith(".png"):
|
||||
src_png = os.path.join(step_dir, file)
|
||||
dst_png = os.path.join(output_dir, file)
|
||||
shutil.copy(src_png, dst_png)
|
||||
self.logger.info(f"Archived plot: {file}")
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_train}", 2)
|
||||
self.tracker.mark_done(task_id_train)
|
||||
|
||||
else:
|
||||
self.logger.info("Skipping Train (Already Done).")
|
||||
# 恢复变量状态
|
||||
self.current_nep_pot = os.path.join(step_dir, "nep.txt")
|
||||
self.current_train_set = os.path.join(step_dir, "train.xyz")
|
||||
|
||||
|
||||
|
||||
|
||||
# ==========================
|
||||
# Step: 04.predict (Conductivity)
|
||||
# ==========================
|
||||
elif step_name == "04.predict":
|
||||
step_dir = os.path.join(iter_path, "04.predict")
|
||||
task_id_predict = f"{iter_name}.04.predict"
|
||||
|
||||
if not self.tracker.is_done(task_id_predict):
|
||||
self.logger.info("=== Step: 04.predict (Arrhenius) ===")
|
||||
os.makedirs(step_dir, exist_ok=True)
|
||||
|
||||
output_dir = os.path.join(iter_path, "05.output")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# -------------------------------------------------
|
||||
# 1. 准备 NEP 势函数 (支持自定义)
|
||||
# -------------------------------------------------
|
||||
custom_nep_name = step_conf.get('custom_nep')
|
||||
nep_src = ""
|
||||
|
||||
if custom_nep_name:
|
||||
# 优先使用 data/ 下的自定义模型
|
||||
nep_src = os.path.join(self.data_dir, custom_nep_name)
|
||||
if not os.path.exists(nep_src):
|
||||
self.logger.error(f"Custom NEP file not found: {nep_src}")
|
||||
return
|
||||
self.logger.info(f"Using Custom NEP model: {custom_nep_name}")
|
||||
else:
|
||||
# 默认使用当前流程的训练结果
|
||||
nep_src = self.current_nep_pot
|
||||
self.logger.info("Using current training result for prediction.")
|
||||
|
||||
if not os.path.exists(nep_src):
|
||||
self.logger.error(f"NEP potential source missing: {nep_src}")
|
||||
return
|
||||
|
||||
# -------------------------------------------------
|
||||
# 2. 准备 model.xyz (支持自定义 VASP -> 转化)
|
||||
# -------------------------------------------------
|
||||
custom_poscar_name = step_conf.get('custom_poscar')
|
||||
final_model_xyz = os.path.join(step_dir, "model.xyz") # 统一存放在 step_dir 根目录备用
|
||||
|
||||
kit_path = self.machine.config['paths'].get('gpumdkit', 'gpumdkit.sh')
|
||||
|
||||
if custom_poscar_name:
|
||||
# === Case A: 使用自定义 VASP 结构 ===
|
||||
self.logger.info(f"Using Custom POSCAR for prediction: {custom_poscar_name}")
|
||||
poscar_src = os.path.join(self.data_dir, custom_poscar_name)
|
||||
|
||||
if os.path.exists(poscar_src):
|
||||
# 复制 VASP 文件到当前目录
|
||||
shutil.copy(poscar_src, os.path.join(step_dir, custom_poscar_name))
|
||||
|
||||
# 获取 label 并执行转化
|
||||
atom_labels = self.param['files'].get('label', '')
|
||||
if not atom_labels:
|
||||
self.logger.error("Labels missing in config (files.label), cannot convert POSCAR.")
|
||||
return
|
||||
|
||||
cmd = f"{kit_path} -addlabel {custom_poscar_name} {atom_labels}"
|
||||
self.logger.info(f"Converting POSCAR to model.xyz: {cmd}")
|
||||
|
||||
if not run_cmd_with_log(cmd, step_dir, "convert_model.log"):
|
||||
self.logger.error("Failed to convert custom POSCAR.")
|
||||
return
|
||||
|
||||
if not os.path.exists(final_model_xyz):
|
||||
self.logger.error("model.xyz not generated after conversion.")
|
||||
return
|
||||
else:
|
||||
self.logger.error(f"Custom POSCAR not found: {poscar_src}")
|
||||
return
|
||||
else:
|
||||
# === Case B: 使用流程默认结构 (00.md) ===
|
||||
self.logger.info("Using default structure from 00.md")
|
||||
default_src = os.path.join(iter_path, "00.md", "model.xyz")
|
||||
|
||||
if os.path.exists(default_src):
|
||||
shutil.copy(default_src, final_model_xyz)
|
||||
else:
|
||||
# 尝试兜底:用 data 里的初始 POSCAR
|
||||
self.logger.warning("00.md/model.xyz not found. Trying initial POSCAR...")
|
||||
# 这里省略复杂的再次转化逻辑,建议保证 00.md 跑通
|
||||
self.logger.error(f"Default model.xyz source missing: {default_src}")
|
||||
return
|
||||
|
||||
# -------------------------------------------------
|
||||
# 3. 遍历温度点执行模拟
|
||||
# -------------------------------------------------
|
||||
conditions = step_conf.get('conditions', [])
|
||||
if not conditions:
|
||||
self.logger.error("No conditions defined for 04.predict")
|
||||
continue
|
||||
|
||||
self.notifier.send("Predict Start", f"Tasks: {len(conditions)}", 5)
|
||||
|
||||
for cond in conditions:
|
||||
temp = cond['T']
|
||||
time_str = cond['time']
|
||||
steps = parse_time_to_steps(time_str)
|
||||
|
||||
# 自动计算 MSD Window (Steps / 200)
|
||||
msd_window = int(steps / 200)
|
||||
|
||||
sub_dir_name = f"{temp}K"
|
||||
sub_work_dir = os.path.join(step_dir, sub_dir_name)
|
||||
os.makedirs(sub_work_dir, exist_ok=True)
|
||||
|
||||
self.logger.info(f"-> Running Prediction: {temp}K, {time_str}")
|
||||
|
||||
# 分发准备好的 model.xyz 和 nep.txt
|
||||
shutil.copy(final_model_xyz, os.path.join(sub_work_dir, "model.xyz"))
|
||||
shutil.copy(nep_src, os.path.join(sub_work_dir, "nep.txt"))
|
||||
|
||||
# 生成 run.in
|
||||
template_path = os.path.join(self.template_dir, "04.predict", "run.in")
|
||||
if os.path.exists(template_path):
|
||||
with open(template_path, 'r') as f:
|
||||
content = f.read()
|
||||
|
||||
content = content.replace("{T}", str(temp))
|
||||
content = content.replace("{STEPS}", str(steps))
|
||||
content = content.replace("{MSD_WINDOW}", str(msd_window))
|
||||
|
||||
with open(os.path.join(sub_work_dir, "run.in"), 'w') as f:
|
||||
f.write(content)
|
||||
else:
|
||||
self.logger.error(f"Template not found: {template_path}")
|
||||
continue
|
||||
|
||||
# 执行 GPUMD
|
||||
if not run_cmd_with_log("gpumd", sub_work_dir, "predict.log"):
|
||||
self.logger.error(f"Prediction failed at {temp}K")
|
||||
|
||||
# -------------------------------------------------
|
||||
# 4. 后处理分析
|
||||
# -------------------------------------------------
|
||||
self.logger.info("Running Analysis (gpumdkit -plt sigma)...")
|
||||
|
||||
analysis_log = os.path.join(step_dir, "sigma_analysis.log")
|
||||
cmd_analyze = f"{kit_path} -plt sigma"
|
||||
|
||||
process = subprocess.Popen(
|
||||
cmd_analyze, shell=True, cwd=step_dir,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
|
||||
)
|
||||
stdout, _ = process.communicate()
|
||||
|
||||
with open(analysis_log, 'w') as f:
|
||||
f.write(stdout)
|
||||
|
||||
# 解析输出并生成报告 (Regex Logic)
|
||||
csv_data = []
|
||||
ea_val = "N/A"
|
||||
sigma_300k = "N/A"
|
||||
|
||||
lines = stdout.split('\n')
|
||||
in_table = False
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if "Ea:" in line:
|
||||
match = re.search(r"Ea:\s+([\d\.]+)\s+eV", line)
|
||||
if match: ea_val = match.group(1)
|
||||
if "at 300K" in line:
|
||||
match = re.search(r"Sigma\s*=\s*([\d\.eE\+\-]+)", line)
|
||||
if match: sigma_300k = match.group(1)
|
||||
if "----------------" in line:
|
||||
in_table = not in_table
|
||||
continue
|
||||
if in_table:
|
||||
parts = line.split()
|
||||
if len(parts) >= 3 and parts[0].isdigit():
|
||||
csv_data.append({
|
||||
"T(K)": parts[0],
|
||||
"Sigma(S/cm)": parts[1],
|
||||
"Sigma*T": parts[2]
|
||||
})
|
||||
|
||||
report_path = os.path.join(output_dir, "conductivity_report.csv")
|
||||
with open(report_path, 'w') as f:
|
||||
f.write(f"# Ea (eV):,{ea_val}\n")
|
||||
f.write(f"# Sigma@300K (S/cm):,{sigma_300k}\n\n")
|
||||
f.write("Temperature(K),Sigma(S/cm),Sigma*T(K*S/cm)\n")
|
||||
for row in csv_data:
|
||||
f.write(f"{row['T(K)']},{row['Sigma(S/cm)']},{row['Sigma*T']}\n")
|
||||
|
||||
self.notifier.send("Predict Done", f"Ea: {ea_val} eV", 5)
|
||||
|
||||
src_png = os.path.join(step_dir, "Arrhenius.png")
|
||||
if os.path.exists(src_png):
|
||||
shutil.copy(src_png, os.path.join(output_dir, "Arrhenius.png"))
|
||||
self.notifier.send(f"Step Done: {step_name}", f"project {self.param['project']} Finished task {task_id_predict}", 2)
|
||||
self.tracker.mark_done(task_id_predict)
|
||||
else:
|
||||
self.logger.info("Skipping Predict (Already Done).")
|
||||
self.notifier.send("Workflow end", f"project {self.param['project']} success", 5)
|
||||
@@ -1,8 +1,10 @@
|
||||
potential ./nep.txt
|
||||
velocity 100
|
||||
|
||||
ensemble npt_mttk temp 100 400 aniso 0 0
|
||||
run 100000
|
||||
ensemble npt_mttk temp 400 1200 aniso 0 0
|
||||
|
||||
ensemble npt_mttk temp 400 800 aniso 0 0
|
||||
dump_thermo 10
|
||||
dump_exyz 10000
|
||||
run 100000
|
||||
run 100000
|
||||
7
template/00.md/production/run.in
Normal file
7
template/00.md/production/run.in
Normal file
@@ -0,0 +1,7 @@
|
||||
potential ./nep.txt
|
||||
velocity 350
|
||||
|
||||
ensemble npt_mttk temp 350 900 aniso 0 0
|
||||
dump_thermo 10
|
||||
dump_exyz 100
|
||||
run 500000
|
||||
29
template/02.scf/INCAR
Normal file
29
template/02.scf/INCAR
Normal file
@@ -0,0 +1,29 @@
|
||||
SYSTEM = Li3YCl6 Training Data Generation
|
||||
|
||||
! --- 电子步基础设置 ---
|
||||
ENCUT = 520 ! [anie202215544-sup-0001-misc_information (1).pdf] 明确指定
|
||||
PREC = Accurate ! 保证力计算的精度
|
||||
ALGO = Normal ! 或者 Normal
|
||||
LREAL = .FALSE. ! 对于生成 Force training data,Projection 最好关掉(False)
|
||||
|
||||
! --- 电子收敛标准 ---
|
||||
! 文献提到能量收敛 1e-4,但为了训练势函数,建议稍微严一点
|
||||
EDIFF = 1E-6
|
||||
ISYM = 0 ! 关闭对称性,防止 MD/采样 过程中因为对称性导致力计算的人为约束
|
||||
|
||||
! --- 展宽设置 (绝缘体/半导体) ---
|
||||
ISMEAR = 0 ! Gaussian Smearing
|
||||
SIGMA = 0.05 ! 配合 ISMEAR=0 使用,文献常用值
|
||||
|
||||
! --- 关键:optB88-vdW 泛函设置 ---
|
||||
! [anie202215544-sup-0001-misc_information (1).pdf] 明确指出使用 optB88-vdW
|
||||
! 以下参数必须完全照抄,不能改动
|
||||
LUSE_VDW = .TRUE. ! 开启 vdW 修正
|
||||
AGGAC = 0.0000 ! 必须为 0
|
||||
GGA = BO ! optB88 基于 MK 交换泛函
|
||||
PARAM1 = 0.18333333 ! optB88 特有参数
|
||||
PARAM2 = 0.22000000 ! optB88 特有参数
|
||||
|
||||
! --- 输出控制 ---
|
||||
LWAVE = .FALSE. ! 训练数据不需要波函数,节省空间
|
||||
LCHARG = .FALSE. ! 不需要电荷密度
|
||||
24
template/02.scf/INCAR_unifrom
Normal file
24
template/02.scf/INCAR_unifrom
Normal file
@@ -0,0 +1,24 @@
|
||||
SYSTEM = static SCF
|
||||
ISTART = 0
|
||||
ICHARG = 2
|
||||
ENCUT = 520
|
||||
NPAR = 8
|
||||
PREC = Normal
|
||||
GGA = PE
|
||||
EDIFF = 1E-6
|
||||
ALGO = Normal
|
||||
NELM = 120
|
||||
NSW = 0
|
||||
IBRION = -1
|
||||
ISPIN = 1
|
||||
ISMEAR = 0
|
||||
SIGMA = 0.05
|
||||
LASPH = .TRUE.
|
||||
LREAL = .FALSE.
|
||||
ADDGRID = .TRUE.
|
||||
ISYM = 2
|
||||
LCHARG = .FALSE.
|
||||
LWAVE = .FALSE.
|
||||
|
||||
KSPACING= 0.25
|
||||
KGAMMA = .TRUE.
|
||||
5
template/02.scf/KPOINTS
Normal file
5
template/02.scf/KPOINTS
Normal file
@@ -0,0 +1,5 @@
|
||||
Automatic Mesh for Training Data
|
||||
0 ! 0 表示自动生成
|
||||
Gamma ! 必须用 Gamma centered(六角/三角晶系推荐)
|
||||
3 3 3 ! 或者 3 3 3,取决于你的计算资源,2x2x2 通常够用
|
||||
0 0 0 ! shift (通常为0)
|
||||
BIN
template/02.scf/vdw_kernel.bindat
Normal file
BIN
template/02.scf/vdw_kernel.bindat
Normal file
Binary file not shown.
4
template/03.train/nep.in
Normal file
4
template/03.train/nep.in
Normal file
@@ -0,0 +1,4 @@
|
||||
type 4 Li Y Cl
|
||||
zbl 2
|
||||
cutoff 6 5
|
||||
generation 100000
|
||||
19
template/04.predict/run.in
Normal file
19
template/04.predict/run.in
Normal file
@@ -0,0 +1,19 @@
|
||||
potential ./nep.txt
|
||||
time_step 1
|
||||
|
||||
# Stage 1: Heating (NPT) - 30 ps
|
||||
velocity 300
|
||||
ensemble npt_scr 300 {T} 100 0 0 0 0 0 0 50 50 50 5 5 5 1000
|
||||
run 30000
|
||||
|
||||
# Stage 2: Equilibration (NPT) - 60 ps
|
||||
ensemble npt_scr {T} {T} 100 0 0 0 0 0 0 50 50 50 5 5 5 1000
|
||||
run 60000
|
||||
|
||||
# Stage 3: Production (NVT)
|
||||
ensemble nvt_nhc {T} {T} 100
|
||||
# MSD 设置: 10 * window * 20 = steps
|
||||
compute_msd 10 {MSD_WINDOW} group 0 0
|
||||
dump_thermo 1000
|
||||
dump_exyz 5000
|
||||
run {STEPS}
|
||||
Reference in New Issue
Block a user