import os

import numpy as np


def _load_rows(path: str) -> np.ndarray:
    """Load a whitespace-separated numeric file as a 2-D array (one row per line).

    ``ndmin=2`` keeps the result two-dimensional even when the file holds a
    single row, so callers can always index ``shape[0]`` / ``shape[1]``.
    """
    return np.loadtxt(path, ndmin=2)


def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz") -> None:
    """Convert DeePMD-kit style ``.raw`` training data to an extended XYZ file.

    The output targets the layout used for GPUMD NEP training: every frame is
    written as an atom-count line, a comment line carrying ``Config_type``,
    ``Weight``, ``Lattice``, ``Energy``, ``Virial``, ``pbc`` and ``Properties``
    fields, and then one line per atom holding the species symbol, the
    position and the force.

    The atom types are read once from ``type.raw`` and reused for every frame.
    Numerical values are written exactly as read (formatted with 10 decimals);
    no unit or sign conversion is applied to energies, forces or virials.

    Args:
        input_folder: Folder containing the ``.raw`` files
            (for example ``'./set.000/'``).
        output_filename: Path of the extended XYZ file to write. Missing
            parent directories are created.

    Raises:
        FileNotFoundError: If any required ``.raw`` file is missing.
        ValueError: If the files cannot be parsed or their shapes are
            inconsistent with each other.
    """
    required_files = [
        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
        'type.raw', 'type_map.raw', 'virial.raw'
    ]

    # Fail early, with the offending path, if any input file is missing.
    for filename in required_files:
        filepath = os.path.join(input_folder, filename)
        if not os.path.exists(filepath):
            raise FileNotFoundError(
                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")

    print(f"Loading raw from folder: {input_folder}")

    coord_path = os.path.join(input_folder, 'coord.raw')
    type_path = os.path.join(input_folder, 'type.raw')

    # --- 1. Read the data ---
    try:
        # type_map.raw: one species symbol per non-empty line.
        with open(os.path.join(input_folder, 'type_map.raw'), 'r', encoding='utf-8') as f:
            type_map_list = [line.strip() for line in f if line.strip()]

        # The first row of coord.raw (3 values per atom) fixes the atom count.
        # atleast_1d covers the 0-d array loadtxt returns for a lone number;
        # such a file cannot describe even one atom and is rejected below.
        first_coord_line = np.atleast_1d(np.loadtxt(coord_path, max_rows=1))
        num_atoms = first_coord_line.size // 3
        if num_atoms == 0:
            raise ValueError(f"Could not determine num_atoms from coord.raw. It seems empty or malformed.")

        # type.raw: blank lines are skipped so a trailing newline cannot
        # disturb the "one type per line" detection below.
        with open(type_path, 'r', encoding='utf-8') as f:
            all_types_lines = [line for line in f if line.strip()]
        if not all_types_lines:
            raise ValueError(f"{type_path} is empty or malformed.")

        # All frames are assumed to share the same type sequence, so only the
        # first line is needed when it already lists every atom.
        first_type_config = np.array([int(x) for x in all_types_lines[0].split()])
        if len(first_type_config) == num_atoms:
            atom_types_numeric = first_type_config
        elif len(all_types_lines) == num_atoms:
            # Alternative layout: one atom type per line.
            atom_types_numeric = np.array([int(line.strip()) for line in all_types_lines])
        else:
            raise ValueError(
                f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
                f"First line of type.raw has {len(first_type_config)} types, total lines {len(all_types_lines)}. "
                f"Please check type.raw format and adjust script.")

        # Reject indices outside type_map.raw; a negative index would
        # otherwise silently pick a symbol from the end of the list.
        if atom_types_numeric.min() < 0 or atom_types_numeric.max() >= len(type_map_list):
            raise ValueError(
                f"type.raw contains type indices outside the range of type_map.raw "
                f"(0..{len(type_map_list) - 1}).")
        atom_symbols = [type_map_list[t] for t in atom_types_numeric]

        # Remaining per-frame data. energy.raw holds one value per frame, so a
        # single-frame file loads as a 0-d array and must be promoted to 1-D.
        boxes = np.loadtxt(os.path.join(input_folder, 'box.raw')).reshape(-1, 3, 3)
        coords_flat = _load_rows(coord_path)
        energies = np.atleast_1d(np.loadtxt(os.path.join(input_folder, 'energy.raw')))
        forces_flat = _load_rows(os.path.join(input_folder, 'force.raw'))
        virials_flat = _load_rows(os.path.join(input_folder, 'virial.raw'))  # 9 components per frame expected
    except Exception as e:
        # Any parsing problem is reported as ValueError (the documented
        # contract); the original exception is chained for debugging.
        raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") from e

    # --- Validate shapes before reshaping ---
    if energies.ndim != 1:
        raise ValueError(
            f"energy.raw has shape {energies.shape}, but expected one energy value per line.")

    num_configs = len(energies)
    expected_coord_cols = num_atoms * 3
    expected_virial_cols = 9

    if coords_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3).")
    if coords_flat.shape[0] != num_configs:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if boxes.shape[0] != num_configs:
        raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if forces_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.")
    if forces_flat.shape[0] != num_configs:
        raise ValueError(
            f"force.raw has {forces_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols:
        raise ValueError(
            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")

    coords = coords_flat.reshape(num_configs, num_atoms, 3)
    forces = forces_flat.reshape(num_configs, num_atoms, 3)
    virials_matrix = virials_flat.reshape(num_configs, 3, 3)

    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")

    # --- 2. Write the extended XYZ file ---
    # Create the parent directory of the output file if it does not exist yet.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_filepath = output_filename  # the given name is used as the final path

    weight_str = "Weight=1.0"
    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"

    with open(output_filepath, 'w', encoding='utf-8') as f:
        for i in range(num_configs):
            # Comment-line fields: cell, energy and the nine virial
            # components, each flattened row by row.
            box_str = " ".join(f"{x:.10f}" for x in boxes[i].flatten())
            energy_str = f"{energies[i]:.10f}"
            virial_str = " ".join(f"{x:.10f}" for x in virials_matrix[i].flatten())
            # The frame index is used as a placeholder label; customise as needed.
            config_type_str = f"Config_type=dpgen_iter{i:03d}"

            frame_lines = [
                # Line 1: number of atoms.
                f"{num_atoms}\n",
                # Line 2: frame metadata.
                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n',
            ]
            # Remaining lines: symbol, position and force of every atom.
            for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, coords[i], forces[i]):
                frame_lines.append(
                    f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n")
            f.writelines(frame_lines)

    print(f"Successfully converted {num_configs} configurations to {output_filepath}")
    print(f"Output file saved at: {output_filepath}")


# --- How to use this function ---
if __name__ == "__main__":
    # Example usage:
    input_folder_path = 'data/dpmd_data/lyc/training_data/p3m1_data/raw'
    output_file_path = 'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz'

    convert_raw_to_gpumd_xyz(input_folder=input_folder_path, output_filename=output_file_path)