Files
solidstate-tools/GPUMD/raw2xyz.py
2025-11-19 12:23:17 +08:00

150 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import numpy as np
def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz") -> None:
    """
    Convert DeePMD-kit style ``.raw`` training data to an extended XYZ file
    for GPUMD NEP training.

    Every configuration is written as one XYZ frame: the atom count, a
    comment line carrying ``Config_type``, ``Weight``, ``Lattice``, ``Energy``,
    ``Virial``, ``pbc`` and a ``Properties`` field, followed by one line per
    atom holding the species symbol, the position and the force.

    Args:
        input_folder (str): Folder containing the .raw files
            (for example './set.000/').
        output_filename (str): Path of the extended XYZ file to write. Missing
            parent directories are created.

    Raises:
        FileNotFoundError: If one of the required .raw files does not exist.
        ValueError: If a file cannot be parsed or the files are inconsistent
            with each other (column counts, row counts, atom types).
    """
    required_files = [
        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
        'type.raw', 'type_map.raw', 'virial.raw'
    ]
    # Fail early, with the exact path, if any input file is missing.
    for filename in required_files:
        filepath = os.path.join(input_folder, filename)
        if not os.path.exists(filepath):
            raise FileNotFoundError(
                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")
    print(f"Loading raw from folder: {input_folder}")

    def raw_path(name):
        return os.path.join(input_folder, name)

    # --- 1. Read the data ---
    # Any I/O or parse failure is reported as ValueError (the documented
    # contract); the original exception is chained so the traceback survives.
    try:
        # type_map.raw: one species symbol per non-empty line.
        with open(raw_path('type_map.raw'), 'r') as f:
            type_map_list = [line.strip() for line in f if line.strip()]
        # type.raw: integer type indices; blank lines are ignored.
        with open(raw_path('type.raw'), 'r') as f:
            type_rows = [[int(tok) for tok in line.split()] for line in f if line.strip()]
        # ndmin keeps the arrays 1-D / 2-D even when the data set holds a
        # single configuration (np.loadtxt would otherwise squeeze them).
        energies = np.loadtxt(raw_path('energy.raw'), ndmin=1).reshape(-1)
        boxes = np.loadtxt(raw_path('box.raw'), ndmin=2).reshape(-1, 3, 3)
        coords_flat = np.loadtxt(raw_path('coord.raw'), ndmin=2)
        forces_flat = np.loadtxt(raw_path('force.raw'), ndmin=2)
        virials_flat = np.loadtxt(raw_path('virial.raw'), ndmin=2)  # nine components per row expected
    except Exception as e:
        raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") from e

    # --- 2. Validate the data ---
    num_configs = energies.shape[0]
    num_atoms = coords_flat.shape[1] // 3 if coords_flat.size else 0
    if num_atoms == 0:
        raise ValueError("Could not determine num_atoms from coord.raw. It seems empty or malformed.")
    expected_coord_cols = num_atoms * 3
    expected_virial_cols = 9
    if coords_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3).")

    # Atom types are taken from the first line when it lists every atom;
    # otherwise all entries of the file together must list every atom
    # (this covers the one-type-per-line layout).
    if not type_rows:
        raise ValueError(f"{raw_path('type.raw')} is empty or malformed.")
    if len(type_rows[0]) == num_atoms:
        atom_types_numeric = type_rows[0]
    else:
        all_types = [t for row in type_rows for t in row]
        if len(all_types) != num_atoms:
            raise ValueError(
                f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
                f"First line of type.raw has {len(type_rows[0])} types, total lines {len(type_rows)}. "
                f"Please check type.raw format and adjust script.")
        atom_types_numeric = all_types
    # Reject indices outside type_map explicitly; a negative index would
    # otherwise silently wrap around to the end of the list.
    invalid_types = sorted({t for t in atom_types_numeric if not 0 <= t < len(type_map_list)})
    if invalid_types:
        raise ValueError(
            f"type.raw contains type indices {invalid_types} that are not defined in type_map.raw "
            f"({len(type_map_list)} entries).")
    atom_symbols = [type_map_list[t] for t in atom_types_numeric]

    if coords_flat.shape[0] != num_configs:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if boxes.shape[0] != num_configs:
        raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if forces_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.")
    if forces_flat.shape[0] != num_configs:
        raise ValueError(
            f"force.raw has {forces_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols:
        raise ValueError(
            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")

    coords = coords_flat.reshape(num_configs, num_atoms, 3)
    forces = forces_flat.reshape(num_configs, num_atoms, 3)
    virials_matrix = virials_flat.reshape(num_configs, 3, 3)
    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")

    # --- 3. Write the extended XYZ file ---
    # Create the output directory if needed; exist_ok avoids a race between
    # an existence check and the creation.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_filepath = output_filename  # the given name is used as the final path
    weight_str = "Weight=1.0"
    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"
    frames = zip(boxes, energies, virials_matrix, coords, forces)
    with open(output_filepath, 'w') as f:
        for i, (box, energy, virial_tensor, frame_coords, frame_forces) in enumerate(frames):
            box_str = " ".join(f"{x:.10f}" for x in box.flatten())
            energy_str = f"{energy:.10f}"
            # The virial is written with all nine components, row by row,
            # exactly as read from virial.raw (no unit or sign conversion).
            virial_str = " ".join(f"{x:.10f}" for x in virial_tensor.flatten())
            config_type_str = f"Config_type=dpgen_iter{i:03d}"  # example label; customise as needed
            # Line 1: atom count. Line 2: per-frame metadata.
            frame_lines = [
                f"{num_atoms}\n",
                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n',
            ]
            # Remaining lines: species symbol, position and force of each atom.
            frame_lines.extend(
                f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n"
                for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, frame_coords, frame_forces)
            )
            f.writelines(frame_lines)
    print(f"Successfully converted {num_configs} configurations to {output_filepath}")
    print(f"Output file saved at: {output_filepath}")
# --- Example usage when the file is run as a script ---
if __name__ == "__main__":
    # Convert the raw set in the first folder into the XYZ file named second.
    convert_raw_to_gpumd_xyz(
        input_folder='data/dpmd_data/lyc/training_data/p3m1_data/raw',
        output_filename='data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz',
    )