150 lines
7.3 KiB
Python
150 lines
7.3 KiB
Python
import os
|
||
import numpy as np
|
||
|
||
|
||
def _load_table(path: str) -> np.ndarray:
    """Load a whitespace-separated numeric .raw file as a 2-D array (one row per line)."""
    # ndmin=2 keeps a one-line file as shape (1, n) instead of collapsing it to 1-D,
    # so data sets that contain a single configuration are handled like any other.
    return np.loadtxt(path, ndmin=2)


def _parse_atom_types(type_lines: list, num_atoms: int, type_path: str) -> list:
    """Return the per-atom integer type indices described by the lines of type.raw.

    Two layouts are accepted: all types of one configuration on the first line,
    or one type per line (``num_atoms`` lines in total).

    Args:
        type_lines: Non-empty, stripped lines of type.raw.
        num_atoms: Number of atoms per configuration (derived from coord.raw).
        type_path: Path of type.raw, used only in error messages.

    Raises:
        ValueError: If the file is empty, contains non-integer tokens, or
            matches neither layout.
    """
    if not type_lines:
        raise ValueError(f"{type_path} is empty or malformed.")

    try:
        first_type_config = [int(token) for token in type_lines[0].split()]
        if len(first_type_config) == num_atoms:
            # All configurations are assumed to share the same type sequence,
            # so only the first line is used.
            return first_type_config
        if len(type_lines) == num_atoms:
            return [int(line) for line in type_lines]
    except ValueError as e:
        raise ValueError(f"{type_path} contains a non-integer atom type. Details: {e}") from e

    raise ValueError(
        f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
        f"First line of type.raw has {len(first_type_config)} types, total lines {len(type_lines)}. "
        f"Please check type.raw format and adjust script.")


def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz") -> None:
    """Convert DeePMD-kit style .raw training data to a GPUMD NEP extended XYZ file.

    Every configuration is written as an atom-count line, a comment line holding
    Config_type, Weight, Lattice, Energy, Virial (nine components), pbc and a
    Properties field, followed by one line per atom with its symbol, position
    and force.

    Args:
        input_folder: Folder containing the .raw files (e.g. './set.000/').
        output_filename: Path of the extended XYZ file to write. Missing parent
            directories are created.

    Raises:
        FileNotFoundError: If any required .raw file is missing.
        ValueError: If the data cannot be read or its shape is not as expected.
    """
    required_files = (
        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
        'type.raw', 'type_map.raw', 'virial.raw',
    )
    paths = {name: os.path.join(input_folder, name) for name in required_files}

    # Fail early, before any parsing, if an input file is missing.
    for filepath in paths.values():
        if not os.path.exists(filepath):
            raise FileNotFoundError(
                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")
    print(f"Loading raw from folder: {input_folder}")

    # --- 1. Read the data ---
    try:
        with open(paths['type_map.raw'], 'r', encoding='utf-8') as f:
            type_map_list = [line.strip() for line in f if line.strip()]  # skip blank lines
        with open(paths['type.raw'], 'r', encoding='utf-8') as f:
            type_lines = [line.strip() for line in f if line.strip()]  # skip blank lines

        boxes = np.loadtxt(paths['box.raw']).reshape(-1, 3, 3)
        coords_flat = _load_table(paths['coord.raw'])
        # ndmin=1 so that a single energy value still has a length of 1.
        energies = np.loadtxt(paths['energy.raw'], ndmin=1)
        forces_flat = _load_table(paths['force.raw'])
        virials_flat = _load_table(paths['virial.raw'])  # nine components per configuration
    except (OSError, ValueError) as e:
        raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") from e

    # --- 2. Validate dimensions ---
    if energies.ndim != 1:
        raise ValueError(f"energy.raw has shape {energies.shape}, but expected one energy per configuration.")
    num_configs = len(energies)
    if num_configs == 0:
        raise ValueError("energy.raw contains no configurations.")

    num_atoms = coords_flat.shape[1] // 3
    if num_atoms == 0:
        raise ValueError("Could not determine num_atoms from coord.raw. It seems empty or malformed.")

    atom_types_numeric = _parse_atom_types(type_lines, num_atoms, paths['type.raw'])
    # Reject out-of-range indices explicitly; a negative index would otherwise
    # silently wrap around and pick the wrong element symbol.
    invalid_types = sorted({t for t in atom_types_numeric if not 0 <= t < len(type_map_list)})
    if invalid_types:
        raise ValueError(
            f"type.raw contains type indices {invalid_types} that are not covered by "
            f"type_map.raw ({len(type_map_list)} entries).")
    atom_symbols = [type_map_list[t] for t in atom_types_numeric]

    expected_coord_cols = num_atoms * 3
    expected_virial_cols = 9  # DeePMD writes the full 3x3 virial tensor

    if coords_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3).")
    if coords_flat.shape[0] != num_configs:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if boxes.shape[0] != num_configs:
        raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if forces_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.")
    if forces_flat.shape[0] != num_configs:
        raise ValueError(
            f"force.raw has {forces_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols:
        raise ValueError(
            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")

    coords = coords_flat.reshape(num_configs, num_atoms, 3)
    forces = forces_flat.reshape(num_configs, num_atoms, 3)

    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")

    # --- 3. Write the GPUMD NEP extended XYZ file ---
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        # exist_ok avoids a race between checking for and creating the directory.
        os.makedirs(output_dir, exist_ok=True)

    output_filepath = output_filename

    # These two fields are identical for every configuration.
    weight_str = "Weight=1.0"
    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"

    with open(output_filepath, 'w', encoding='utf-8') as f:
        frames = zip(boxes, energies, virials_flat, coords, forces)
        for i, (box, energy, virial, frame_coords, frame_forces) in enumerate(frames):
            # Line 1: number of atoms.
            f.write(f"{num_atoms}\n")

            # Line 2: per-configuration metadata.
            box_str = " ".join(f"{x:.10f}" for x in box.flatten())
            energy_str = f"{energy:.10f}"
            virial_str = " ".join(f"{x:.10f}" for x in virial)
            config_type_str = f"Config_type=dpgen_iter{i:03d}"  # example label; customise as needed
            f.write(
                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n'
            )

            # Remaining lines: symbol, position and force of each atom.
            for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, frame_coords, frame_forces):
                f.write(f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n")

    print(f"Successfully converted {num_configs} configurations to {output_filepath}")
    print(f"Output file saved at: {output_filepath}")
|
||
|
||
|
||
# --- Example usage ---
if __name__ == "__main__":
    # Folder holding the DeePMD-kit .raw files, and the extended XYZ file to produce.
    raw_dir = 'data/dpmd_data/lyc/training_data/p3m1_data/raw'
    xyz_path = 'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz'

    convert_raw_to_gpumd_xyz(
        input_folder=raw_dir,
        output_filename=xyz_path,
    )
|