一些小修改

This commit is contained in:
2025-11-19 12:23:17 +08:00
parent 95d719cc1e
commit 80ae03c8c1
25 changed files with 2291 additions and 17 deletions

149
GPUMD/raw2xyz.py Normal file
View File

@@ -0,0 +1,149 @@
import os
import numpy as np
def _load_rows(path: str) -> np.ndarray:
    """Load a whitespace-separated numeric file as a 2-D array, one row per line.

    ``ndmin=2`` keeps a single-line file as shape ``(1, n_cols)``. Without it
    ``np.loadtxt`` squeezes the result to 1-D and any ``shape[1]`` access or
    per-frame indexing fails for data sets that contain only one frame.

    Raises:
        ValueError: if the file holds no numeric data.
    """
    data = np.loadtxt(path, ndmin=2)
    if data.ndim != 2 or data.size == 0:
        raise ValueError(f"{path} is empty or malformed.")
    return data


def _read_atom_types(path: str, num_atoms: int) -> np.ndarray:
    """Read the per-atom integer type indices from a ``type.raw`` file.

    Two layouts are accepted: all types on the first line, or one type per
    line. Blank lines are ignored so a trailing newline does not break the
    one-per-line layout.

    Raises:
        ValueError: if the file is empty or matches neither layout.
    """
    with open(path, 'r', encoding='utf-8') as f:
        type_lines = [line.strip() for line in f if line.strip()]
    if not type_lines:
        raise ValueError(f"{path} is empty or malformed.")

    # The type sequence is assumed identical for every frame, so only the
    # first line is needed in the one-line layout.
    first_line_types = np.array([int(tok) for tok in type_lines[0].split()])
    if len(first_line_types) == num_atoms:
        return first_line_types
    if len(type_lines) == num_atoms:
        return np.array([int(line) for line in type_lines])
    raise ValueError(
        f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
        f"First line of type.raw has {len(first_line_types)} types, total lines {len(type_lines)}. "
        f"Please check type.raw format and adjust script.")


def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz") -> None:
    """Convert DeePMD-kit style ``.raw`` training data to an extended XYZ file.

    Every frame is written as: an atom-count line, a comment line carrying
    ``Config_type``, ``Weight``, ``Lattice``, ``Energy``, ``Virial``, ``pbc``
    and ``Properties``, then one line per atom with the species symbol, the
    position and the force. Data sets containing a single frame are supported.

    Args:
        input_folder: Folder containing the ``.raw`` files (e.g. ``'./set.000/'``).
        output_filename: Path of the extended XYZ file to write. Missing parent
            directories are created.

    Raises:
        FileNotFoundError: if any required ``.raw`` file is missing.
        ValueError: if a file cannot be parsed or the array shapes disagree.
    """
    required_files = [
        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
        'type.raw', 'type_map.raw', 'virial.raw'
    ]
    # Fail early, before any parsing, if a required file is missing.
    for filename in required_files:
        filepath = os.path.join(input_folder, filename)
        if not os.path.exists(filepath):
            raise FileNotFoundError(
                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")
    print(f"Loading raw from folder: {input_folder}")

    # --- 1. Read the data ---
    try:
        with open(os.path.join(input_folder, 'type_map.raw'), 'r', encoding='utf-8') as f:
            type_map_list = [line.strip() for line in f if line.strip()]  # skip blank lines

        # coord.raw is parsed once; its column count determines num_atoms.
        coords_flat = _load_rows(os.path.join(input_folder, 'coord.raw'))
        num_atoms = coords_flat.shape[1] // 3
        if num_atoms == 0:
            raise ValueError("Could not determine num_atoms from coord.raw. It seems empty or malformed.")

        atom_types_numeric = _read_atom_types(os.path.join(input_folder, 'type.raw'), num_atoms)
        # Reject out-of-range indices explicitly: a negative index would
        # otherwise wrap around type_map_list and silently pick a wrong species.
        n_species = len(type_map_list)
        invalid = [int(t) for t in atom_types_numeric if not 0 <= t < n_species]
        if invalid:
            raise ValueError(
                f"type.raw references type index {invalid[0]}, "
                f"but type_map.raw defines only {n_species} species.")
        atom_symbols = [type_map_list[t] for t in atom_types_numeric]

        boxes = _load_rows(os.path.join(input_folder, 'box.raw')).reshape(-1, 3, 3)
        # ndmin=1 keeps a single energy value as a length-1 array so len() works.
        energies = np.loadtxt(os.path.join(input_folder, 'energy.raw'), ndmin=1)
        forces_flat = _load_rows(os.path.join(input_folder, 'force.raw'))
        virials_flat = _load_rows(os.path.join(input_folder, 'virial.raw'))  # 9 components per frame expected
    except Exception as exc:
        # Deliberately broad: every read/parse failure is reported as ValueError,
        # with the original exception chained for debugging.
        raise ValueError(f"Error reading .raw files. Please check their format. Details: {exc}") from exc

    # --- Validate array shapes ---
    if energies.ndim != 1:
        raise ValueError(
            f"energy.raw has shape {energies.shape}, but expected one energy value per configuration.")
    num_configs = len(energies)
    expected_coord_cols = num_atoms * 3
    expected_virial_cols = 9
    if coords_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3).")
    if coords_flat.shape[0] != num_configs:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if boxes.shape[0] != num_configs:
        raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if forces_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.")
    if forces_flat.shape[0] != num_configs:
        raise ValueError(
            f"force.raw has {forces_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols:
        raise ValueError(
            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")

    coords = coords_flat.reshape(num_configs, num_atoms, 3)
    forces = forces_flat.reshape(num_configs, num_atoms, 3)
    virials_matrix = virials_flat.reshape(num_configs, 3, 3)
    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")

    # --- 2. Write the extended XYZ file ---
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # These header fields are identical for every frame.
    weight_str = "Weight=1.0"
    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"
    with open(output_filename, 'w', encoding='utf-8') as f:
        for i in range(num_configs):
            # Line 1: number of atoms.
            f.write(f"{num_atoms}\n")
            # Line 2: per-frame metadata. Lattice and virial are written as
            # nine row-major components, unchanged from the .raw values.
            box_str = " ".join(f"{x:.10f}" for x in boxes[i].flatten())
            energy_str = f"{energies[i]:.10f}"
            virial_str = " ".join(f"{x:.10f}" for x in virials_matrix[i].flatten())
            config_type_str = f"Config_type=dpgen_iter{i:03d}"  # label is the frame index
            f.write(
                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n'
            )
            # Remaining lines: species symbol, position and force of each atom.
            for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, coords[i], forces[i]):
                f.write(f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n")
    print(f"Successfully converted {num_configs} configurations to {output_filename}")
    print(f"Output file saved at: {output_filename}")
# --- Example usage ---
if __name__ == "__main__":
    # Convert the raw folder of the p3m1 data set into a single training file.
    convert_raw_to_gpumd_xyz(
        'data/dpmd_data/lyc/training_data/p3m1_data/raw',
        'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz',
    )