一些小修改

2025-11-19 12:23:17 +08:00
parent 95d719cc1e
commit 80ae03c8c1
25 changed files with 2291 additions and 17 deletions
--- a/GPUMD/raw2xyz.py
+++ b/GPUMD/raw2xyz.py
@@ -0,0 +1,149 @@
+import os
+import numpy as np
+
+
+def _read_atom_types(type_path: str, num_atoms: int) -> np.ndarray:
+    """Read the per-atom integer type indices from a ``type.raw`` file.
+
+    Two layouts are accepted: all ``num_atoms`` indices on the first line, or
+    one index per line with exactly ``num_atoms`` non-blank lines.
+
+    Raises:
+        ValueError: If the file is empty, contains non-integer tokens, or
+            matches neither layout.
+    """
+    with open(type_path, 'r') as f:
+        # Blank lines (e.g. a trailing newline) must not count as atoms.
+        type_lines = [line.strip() for line in f if line.strip()]
+    if not type_lines:
+        raise ValueError(f"{type_path} is empty or malformed.")
+
+    first_line_types = [int(token) for token in type_lines[0].split()]
+    if len(first_line_types) == num_atoms:
+        return np.array(first_line_types)
+    if len(type_lines) == num_atoms:
+        return np.array([int(line) for line in type_lines])
+    raise ValueError(
+        f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
+        f"First line of type.raw has {len(first_line_types)} types, total lines {len(type_lines)}. "
+        f"Please check type.raw format and adjust script.")
+
+
+def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz") -> None:
+    """
+    Convert DeePMD-kit style .raw training data into the extended XYZ format
+    used for GPUMD NEP training.
+
+    Each configuration is written as: a line with the atom count, a comment
+    line carrying Config_type, Weight, Lattice, Energy, Virial (nine
+    components), pbc and Properties, and then one line per atom holding the
+    species symbol, the position and the force.
+
+    Data sets containing a single configuration are supported.
+
+    Args:
+        input_folder (str): Folder containing the .raw files (for example './set.000/').
+        output_filename (str): Path of the extended XYZ file to write. Missing
+            parent directories are created.
+    Raises:
+        FileNotFoundError: If a required .raw file does not exist.
+        ValueError: If a file cannot be parsed or the data shapes are inconsistent.
+    """
+    required_files = [
+        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
+        'type.raw', 'type_map.raw', 'virial.raw'
+    ]
+    # Fail early, with the offending path, if any input file is missing.
+    paths = {name: os.path.join(input_folder, name) for name in required_files}
+    for filepath in paths.values():
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(
+                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")
+    print(f"Loading raw from folder: {input_folder}")
+
+    # --- 1. Read the data ---
+    try:
+        # type_map.raw: one species symbol per non-blank line.
+        with open(paths['type_map.raw'], 'r') as f:
+            type_map_list = [line.strip() for line in f if line.strip()]
+
+        # ndmin=2 keeps the (n_configs, n_columns) layout even when the file
+        # holds a single configuration (np.loadtxt would otherwise return 1-D).
+        coords_flat = np.loadtxt(paths['coord.raw'], ndmin=2)
+        if coords_flat.size == 0 or coords_flat.shape[1] % 3 != 0:
+            raise ValueError("Could not determine num_atoms from coord.raw. It seems empty or malformed.")
+        num_atoms = coords_flat.shape[1] // 3
+
+        # The type sequence is assumed to be identical for every configuration.
+        atom_types_numeric = _read_atom_types(paths['type.raw'], num_atoms)
+        # Reject out-of-range indices explicitly; a negative index would
+        # otherwise silently select a symbol from the end of the type map.
+        if atom_types_numeric.min() < 0 or atom_types_numeric.max() >= len(type_map_list):
+            raise ValueError(
+                f"type.raw contains a type index outside the range 0..{len(type_map_list) - 1} "
+                f"defined by type_map.raw.")
+        atom_symbols = [type_map_list[t] for t in atom_types_numeric]
+
+        boxes = np.loadtxt(paths['box.raw']).reshape(-1, 3, 3)
+        # A single energy value is loaded as a 0-d array; promote it to 1-D.
+        energies = np.atleast_1d(np.loadtxt(paths['energy.raw']))
+        forces_flat = np.loadtxt(paths['force.raw'], ndmin=2)
+        virials_flat = np.loadtxt(paths['virial.raw'], ndmin=2)  # nine components per configuration
+
+    except (OSError, ValueError, IndexError) as e:
+        raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") from e
+
+    # --- Validate the array dimensions ---
+    if energies.ndim != 1:
+        raise ValueError(
+            f"energy.raw has shape {energies.shape}, but expected one energy value per configuration.")
+    num_configs = energies.shape[0]
+    expected_coord_cols = num_atoms * 3
+    expected_virial_cols = 9  # full 3x3 virial tensor
+
+    if coords_flat.shape[1] != expected_coord_cols:
+        raise ValueError(
+            f"coord.raw has {coords_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3).")
+    if coords_flat.shape[0] != num_configs:
+        raise ValueError(
+            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
+    if boxes.shape[0] != num_configs:
+        raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
+    if forces_flat.shape[1] != expected_coord_cols:
+        raise ValueError(
+            f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.")
+    if forces_flat.shape[0] != num_configs:
+        raise ValueError(
+            f"force.raw has {forces_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
+    if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols:
+        raise ValueError(
+            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")
+
+    coords = coords_flat.reshape(num_configs, num_atoms, 3)
+    forces = forces_flat.reshape(num_configs, num_atoms, 3)
+
+    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")
+
+    # --- 2. Write the extended XYZ file ---
+    # Create the output directory if needed; exist_ok avoids a check-then-create race.
+    output_dir = os.path.dirname(output_filename)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    output_filepath = output_filename  # the given name is used as the final path
+
+    # The Properties string is the same for every configuration.
+    weight_str = "Weight=1.0"
+    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"
+
+    with open(output_filepath, 'w') as f:
+        for i in range(num_configs):
+            # Line 1: number of atoms.
+            f.write(f"{num_atoms}\n")
+
+            # Line 2: per-configuration metadata.
+            box_str = " ".join(f"{x:.10f}" for x in boxes[i].flatten())
+            energy_str = f"{energies[i]:.10f}"
+            # Each row of virials_flat already holds the nine tensor components in order.
+            virial_str = " ".join(f"{x:.10f}" for x in virials_flat[i])
+            config_type_str = f"Config_type=dpgen_iter{i:03d}"  # example label, customise as needed
+
+            f.write(
+                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n'
+            )
+
+            # Remaining lines: species symbol, position and force of each atom.
+            for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, coords[i], forces[i]):
+                f.write(f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n")
+
+    print(f"Successfully converted {num_configs} configurations to {output_filepath}")
+    print(f"Output file saved at: {output_filepath}")
+
+
+# --- How to use this function ---
+if __name__ == "__main__":
+    # Example usage: convert one folder of .raw files into a single
+    # extended XYZ training file.
+    raw_dir = 'data/dpmd_data/lyc/training_data/p3m1_data/raw'
+    xyz_path = 'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz'
+
+    convert_raw_to_gpumd_xyz(
+        input_folder=raw_dir,
+        output_filename=xyz_path,
+    )