# solidstate-tools/GPUMD/raw2xyz.py

import os
import numpy as np


def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz"):
    """Convert DeePMD-kit style ``.raw`` training data to a GPUMD NEP extended XYZ file.

    Every frame is written as: one line with the atom count, one comment line
    carrying ``Config_type``, ``Weight``, ``Lattice``, ``Energy``, ``Virial``
    (all nine components), ``pbc`` and ``Properties``, followed by one line per
    atom holding the species symbol, the position and the force.

    Datasets containing a single frame are supported: every numeric file is
    loaded with an explicit minimum dimensionality so that a one-row file is
    not collapsed to a lower-rank array.

    Args:
        input_folder (str): Folder containing the ``.raw`` files
            (for example ``'./set.000/'``).
        output_filename (str): Path of the extended XYZ file to write. Missing
            parent directories are created.

    Raises:
        FileNotFoundError: If one of the required ``.raw`` files is missing.
        ValueError: If a file cannot be parsed or the arrays are inconsistent
            with each other.
    """
    required_files = [
        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
        'type.raw', 'type_map.raw', 'virial.raw'
    ]
    # Fail early, before any parsing, if a required file is missing.
    for filename in required_files:
        filepath = os.path.join(input_folder, filename)
        if not os.path.exists(filepath):
            raise FileNotFoundError(
                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")
    print(f"Loading raw from folder: {input_folder}")

    # --- 1. Read the data ---
    type_path = os.path.join(input_folder, 'type.raw')
    try:
        with open(os.path.join(input_folder, 'type_map.raw'), 'r') as f:
            type_map_list = [line.strip() for line in f if line.strip()]  # skip blank lines
        with open(type_path, 'r') as f:
            type_rows = [line.split() for line in f if line.strip()]

        # ndmin keeps one row per frame even when the file holds a single
        # frame; without it loadtxt returns a 1-D (or 0-D) array and the
        # shape checks below would fail.
        coords_flat = np.loadtxt(os.path.join(input_folder, 'coord.raw'), ndmin=2)
        forces_flat = np.loadtxt(os.path.join(input_folder, 'force.raw'), ndmin=2)
        virials_flat = np.loadtxt(os.path.join(input_folder, 'virial.raw'), ndmin=2)
        boxes_flat = np.loadtxt(os.path.join(input_folder, 'box.raw'), ndmin=2)
        energies = np.loadtxt(os.path.join(input_folder, 'energy.raw'), ndmin=1)
    except (OSError, ValueError) as e:
        raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") from e

    # --- 2. Validate shapes and derive the frame / atom counts ---
    if energies.ndim != 1 or energies.size == 0:
        raise ValueError(
            f"energy.raw has shape {energies.shape}, but expected one energy value per line.")
    num_configs = energies.shape[0]

    if coords_flat.size == 0 or coords_flat.shape[1] % 3 != 0:
        raise ValueError("Could not determine num_atoms from coord.raw. It seems empty or malformed.")
    num_atoms = coords_flat.shape[1] // 3
    expected_coord_cols = num_atoms * 3
    expected_virial_cols = 9  # all nine tensor components are expected

    if coords_flat.shape[0] != num_configs:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if boxes_flat.size != num_configs * 9:
        raise ValueError(
            f"box.raw has {boxes_flat.size} values, but expected {num_configs * 9} "
            f"(9 per configuration). Data mismatch.")
    if forces_flat.shape != (num_configs, expected_coord_cols):
        raise ValueError(
            f"force.raw has shape {forces_flat.shape}, but expected ({num_configs}, {expected_coord_cols}) "
            f"(N_configs, N_atoms * 3). Check file format.")
    if virials_flat.shape != (num_configs, expected_virial_cols):
        raise ValueError(
            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")

    # --- 3. Resolve per-atom species symbols ---
    if not type_rows:
        raise ValueError(f"{type_path} is empty or malformed.")
    all_type_tokens = [token for row in type_rows for token in row]
    if len(type_rows[0]) == num_atoms:
        # The first line lists every atom; the same sequence is assumed for all frames.
        type_tokens = type_rows[0]
    elif len(all_type_tokens) == num_atoms:
        # The types are spread over several lines (e.g. one atom type per line).
        type_tokens = all_type_tokens
    else:
        raise ValueError(
            f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
            f"First line of type.raw has {len(type_rows[0])} types, total lines {len(type_rows)}. "
            f"Please check type.raw format and adjust script.")
    try:
        atom_types_numeric = [int(token) for token in type_tokens]
    except ValueError as e:
        raise ValueError(f"{type_path} contains a non-integer atom type. Details: {e}") from e

    # Reject negative indices explicitly: Python would silently wrap them
    # around to the end of the type map.
    bad_types = sorted({t for t in atom_types_numeric if not 0 <= t < len(type_map_list)})
    if bad_types:
        raise ValueError(
            f"type.raw references type indices {bad_types}, but type_map.raw only defines "
            f"{len(type_map_list)} species.")
    atom_symbols = [type_map_list[t] for t in atom_types_numeric]

    boxes = boxes_flat.reshape(num_configs, 3, 3)
    coords = coords_flat.reshape(num_configs, num_atoms, 3)
    forces = forces_flat.reshape(num_configs, num_atoms, 3)

    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")

    # --- 4. Write the GPUMD NEP extended XYZ file ---
    output_filepath = output_filename
    output_dir = os.path.dirname(output_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    weight_str = "Weight=1.0"
    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"

    with open(output_filepath, 'w') as f:
        for i in range(num_configs):
            # Line 1: number of atoms in the frame.
            f.write(f"{num_atoms}\n")

            # Line 2: per-frame metadata.
            box_str = " ".join(f"{x:.10f}" for x in boxes[i].flatten())
            energy_str = f"{energies[i]:.10f}"
            virial_str = " ".join(f"{x:.10f}" for x in virials_flat[i])
            config_type_str = f"Config_type=dpgen_iter{i:03d}"  # label derived from the frame index

            f.write(
                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n'
            )

            # Remaining lines: species symbol, position and force of each atom.
            for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, coords[i], forces[i]):
                f.write(f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n")

    print(f"Successfully converted {num_configs} configurations to {output_filepath}")
    print(f"Output file saved at: {output_filepath}")


# --- Example usage of the converter ---
if __name__ == "__main__":
    # Source folder holding the .raw files and the extended XYZ file to produce.
    raw_dir = 'data/dpmd_data/lyc/training_data/p3m1_data/raw'
    xyz_path = 'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz'

    convert_raw_to_gpumd_xyz(
        input_folder=raw_dir,
        output_filename=xyz_path,
    )