一些小修改

2025-11-19 12:23:17 +08:00
parent 95d719cc1e
commit 80ae03c8c1
25 changed files with 2291 additions and 17 deletions
--- a/GPUMD/Umap/umap_make.py
+++ b/GPUMD/Umap/umap_make.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")  # 仅保存图片，不弹窗
+import matplotlib.pyplot as plt
+from umap import UMAP
+
+def umap_dir_to_pngs(dir_path: str, *, n_features: int = 30) -> None:
+    """
+    Run UMAP (n_features-D -> 2-D) on each .npy file in a directory and save one scatter plot per file.
+
+    - Each .npy is expected to have shape (n_samples, n_features) or (n_features, n_samples);
+      the second layout is transposed. A square (n_features, n_features) array is taken as
+      samples-by-features because the column count is checked first.
+    - Rows containing NaN/Inf are dropped before fitting.
+    - Files that are not 2-D, have no axis of length n_features, or keep fewer than 3 rows are skipped.
+    - Each image is written next to its input as <original stem>_umap.png.
+    - A failure while processing one file is printed and does not stop the remaining files.
+
+    Args:
+        dir_path: Directory searched (non-recursively) for ``*.npy`` files.
+        n_features: Expected feature dimension (default 30).
+
+    Raises:
+        ValueError: If ``dir_path`` is not an existing directory.
+    """
+    p = Path(dir_path)
+    if not p.is_dir():
+        raise ValueError(f"{dir_path!r} 不是有效文件夹")
+
+    files = sorted(p.glob("*.npy"))
+    if not files:
+        print(f"目录 {p} 中未找到 .npy 文件")
+        return
+
+    for f in files:
+        fig = None  # tracked so the figure is released even when plotting/saving fails
+        try:
+            data = np.load(f)
+            if data.ndim != 2:
+                print(f"[跳过] {f.name}: 期望二维数组，实际 shape={data.shape}")
+                continue
+
+            if data.shape[1] == n_features:
+                X = data
+            elif data.shape[0] == n_features:
+                X = data.T
+            else:
+                print(f"[跳过] {f.name}: shape={data.shape}, 未检测到 {n_features} 维特征")
+                continue
+
+            # Drop sample rows that contain non-finite values
+            mask = np.isfinite(X).all(axis=1)
+            if not np.all(mask):
+                X = X[mask]
+                print(f"[提示] {f.name}: 移除了含 NaN/Inf 的样本行")
+
+            n_samples = X.shape[0]
+            if n_samples < 3:
+                print(f"[跳过] {f.name}: 样本数过少(n={n_samples})，无法稳定降维")
+                continue
+
+            # Keep n_neighbors within [2, n_samples - 1], capped at 15
+            n_neighbors = min(15, max(2, n_samples - 1))
+            reducer = UMAP(
+                n_components=2,
+                n_neighbors=n_neighbors,
+                min_dist=0.1,
+                metric="euclidean",
+                random_state=42,
+            )
+            emb = reducer.fit_transform(X)
+
+            fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
+            ax.scatter(emb[:, 0], emb[:, 1], s=6, c="#1f77b4", alpha=0.8, edgecolors="none")
+            ax.set_title(f"{f.name} • UMAP (n={len(X)}, nn={n_neighbors})", fontsize=10)
+            ax.set_xlabel("UMAP-1")
+            ax.set_ylabel("UMAP-2")
+            ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
+            fig.tight_layout()
+
+            out_png = f.with_suffix("").as_posix() + "_umap.png"
+            fig.savefig(out_png)
+            print(f"[完成] {f.name} -> {out_png}")
+        except Exception as e:
+            # Best-effort batch job: report the failure and move on to the next file
+            print(f"[错误] 处理 {f.name} 失败: {e}")
+        finally:
+            if fig is not None:
+                plt.close(fig)
+
+# Script entry point: process every .npy file in the relative "data" directory.
+if __name__=="__main__":
+    umap_dir_to_pngs("data")
--- a/GPUMD/Umap/umap_make_2.py
+++ b/GPUMD/Umap/umap_make_2.py
@@ -0,0 +1,161 @@
+from pathlib import Path
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+try:
+    from umap import UMAP
+except Exception:
+    from umap.umap_ import UMAP
+
+
+def umap_dir_shared_coords(
+    dir_path: str,
+    *,
+    metric: str = "cosine",
+    n_neighbors: int = 15,
+    min_dist: float = 0.0,
+    spread: float = 1.2,
+    standardize: bool = False,
+    context: bool = True,
+    make_joint: bool = True,
+    init: str = "random",     # non-spectral initialisation (avoids the spectral-init warning); "pca" also works
+    jitter: float = 0.0,      # optional: std-dev of Gaussian noise added before fitting, e.g. 1e-6
+    random_state: int = 42
+) -> None:
+    """
+    Embed all .npy files of a directory with one shared UMAP fit and plot each file in that common 2-D space.
+
+    - Every .npy must be shaped (n_samples, 30) or (30, n_samples); other files are skipped.
+    - Rows with NaN/Inf are removed; files left with fewer than 3 rows are skipped.
+    - All usable files are stacked, optionally standardised / jittered, and fitted once.
+    - Every per-file figure uses the same axis limits and is saved as <stem>_umap_shared.png;
+      when ``make_joint`` is true an overview is saved as umap_shared_overview.png in ``dir_path``.
+    - With ``context`` enabled, each per-file figure also shows all points in grey plus a legend.
+
+    Raises:
+        ValueError: If ``dir_path`` is not an existing directory.
+    """
+    root = Path(dir_path)
+    if not root.is_dir():
+        raise ValueError(f"{dir_path!r} 不是有效文件夹")
+
+    npy_files = sorted(root.glob("*.npy"))
+    if not npy_files:
+        print(f"目录 {root} 中未找到 .npy 文件")
+        return
+
+    # Collect (path, cleaned feature matrix) for every usable file
+    loaded = []
+    for npy in npy_files:
+        try:
+            arr = np.load(npy)
+            if arr.ndim != 2:
+                print(f"[跳过] {npy.name}: 期望二维数组，实际 shape={arr.shape}")
+                continue
+
+            n_rows, n_cols = arr.shape
+            if n_cols == 30:
+                feats = arr
+            elif n_rows == 30:
+                feats = arr.T
+            else:
+                print(f"[跳过] {npy.name}: shape={arr.shape}, 未检测到 30 维特征")
+                continue
+
+            finite_rows = np.isfinite(feats).all(axis=1)
+            if not finite_rows.all():
+                feats = feats[finite_rows]
+                print(f"[提示] {npy.name}: 移除了含 NaN/Inf 的样本行")
+
+            if feats.shape[0] < 3:
+                print(f"[跳过] {npy.name}: 样本数过少(n={feats.shape[0]})")
+                continue
+
+            loaded.append((npy, feats))
+        except Exception as err:
+            print(f"[错误] 读取 {npy.name} 失败: {err}")
+
+    if not loaded:
+        print("未找到可用的数据文件")
+        return
+
+    stacked = np.vstack([feats for _, feats in loaded])
+
+    if standardize:
+        mu = stacked.mean(axis=0)
+        sigma = stacked.std(axis=0)
+        sigma[sigma == 0] = 1.0  # constant columns: avoid division by zero
+        stacked = (stacked - mu) / sigma
+
+    if jitter and jitter > 0:
+        noise_rng = np.random.default_rng(random_state)
+        stacked = stacked + noise_rng.normal(scale=jitter, size=stacked.shape)
+
+    embedding = UMAP(
+        n_components=2,
+        n_neighbors=int(max(2, n_neighbors)),
+        min_dist=float(min_dist),
+        spread=float(spread),
+        metric=metric,
+        init=init,
+        random_state=random_state,
+    ).fit_transform(stacked)
+
+    # Common axis limits with a 5% margin (fallback margin 1.0 for a degenerate range)
+    all_x, all_y = embedding[:, 0], embedding[:, 1]
+    x_min, x_max = float(all_x.min()), float(all_x.max())
+    y_min, y_max = float(all_y.min()), float(all_y.max())
+    pad_x = 0.05 * (x_max - x_min) if x_max > x_min else 1.0
+    pad_y = 0.05 * (y_max - y_min) if y_max > y_min else 1.0
+    x_limits = (x_min - pad_x, x_max + pad_x)
+    y_limits = (y_min - pad_y, y_max + pad_y)
+
+    base_colors = [
+        "#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd",
+        "#8c564b","#e377c2","#7f7f7f","#bcbd22","#17becf"
+    ]
+
+    # Split the stacked embedding back into per-file slices, each with its cycled colour
+    groups = []
+    offset = 0
+    for idx, (npy, feats) in enumerate(loaded):
+        stop = offset + feats.shape[0]
+        groups.append((npy, embedding[offset:stop], base_colors[idx % len(base_colors)]))
+        offset = stop
+
+    # One figure per file
+    for npy, pts, color in groups:
+        fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
+        if context:
+            ax.scatter(all_x, all_y, s=5, c="#cccccc",
+                       alpha=0.35, edgecolors="none", label="All")
+        ax.scatter(pts[:, 0], pts[:, 1], s=10, c=color,
+                   alpha=0.9, edgecolors="none", label=npy.name)
+
+        title = f"{npy.name} • UMAP(shared) (nn={n_neighbors}, min={min_dist}, metric={metric}, init={init})"
+        ax.set_title(title, fontsize=9)
+        ax.set_xlabel("UMAP-1")
+        ax.set_ylabel("UMAP-2")
+        ax.set_xlim(*x_limits)
+        ax.set_ylim(*y_limits)
+        ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
+        if context:
+            ax.legend(loc="best", fontsize=8, frameon=False)
+        fig.tight_layout()
+
+        out_png = f"{npy.with_suffix('').as_posix()}_umap_shared.png"
+        fig.savefig(out_png)
+        plt.close(fig)
+        print(f"[完成] {npy.name} -> {out_png}")
+
+    if not make_joint:
+        return
+
+    # Overview figure with every file in its own colour
+    fig, ax = plt.subplots(figsize=(7, 6), dpi=150)
+    for npy, pts, color in groups:
+        ax.scatter(pts[:, 0], pts[:, 1], s=8, c=color,
+                   alpha=0.85, edgecolors="none", label=npy.name)
+    overview_title = f"UMAP(shared) overview (metric={metric}, nn={n_neighbors}, min={min_dist}, init={init})"
+    ax.set_title(overview_title, fontsize=10)
+    ax.set_xlabel("UMAP-1")
+    ax.set_ylabel("UMAP-2")
+    ax.set_xlim(*x_limits)
+    ax.set_ylim(*y_limits)
+    ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
+    ax.legend(loc="best", fontsize=8, frameon=False, ncol=1)
+    fig.tight_layout()
+    out_png = Path(dir_path) / "umap_shared_overview.png"
+    fig.savefig(out_png.as_posix())
+    plt.close(fig)
+    print(f"[完成] 总览 -> {out_png}")
+# Script entry point: embed every .npy file in the relative "data" directory with default settings.
+if __name__=="__main__":
+    umap_dir_shared_coords("data")
--- a/GPUMD/data_POSCAR/origin/pnma.vasp
+++ b/GPUMD/data_POSCAR/origin/pnma.vasp
@@ -0,0 +1,88 @@
+Li Y  Cl
+ 1.0000000000000000
+    12.1082364219999992   -0.0000000000000000    0.0000000000000000
+     0.0000420925000000   12.6964871139000000    0.0000000000000000
+     0.0000111360000000    0.0000097283000000   11.1520040839999997
+ Li  Y   Cl 
+  24   8  48
+Cartesian
+  3.0170113299999999 11.0208475999999997  6.5429541999999996
+  9.0710813300000002 11.0208076100000003  6.5429413099999998
+  3.0372732299999998  1.6755553700000001  0.9669378900000000
+  9.0914532300000008  1.6755853700000001  0.9668849900000001
+  5.9960454600000004  8.0228419300000002  4.6273539599999998
+ 12.0502254600000001  8.0228319300000006  4.6273410699999999
+  6.0439837100000000  8.0239104000000001  0.9669839400000000
+ 12.0980237099999997  8.0238703999999998  0.9669410500000000
+  2.9687930800000002 11.0219791300000001 10.2032142199999996
+  9.0228930799999993 11.0219691300000004 10.2032413300000009
+  3.0851749800000001  1.6745967300000000  4.6273578600000000
+  9.1393549800000002  1.6745967399999999  4.6273049700000000
+  0.0581325900000000  4.6736939399999997 10.2033481199999994
+  6.1122525899999998  4.6736539400000003 10.2033652299999993
+  0.0102008400000000  4.6725925699999999  6.5430081500000004
+  6.0643308400000002  4.6725225699999999  6.5430152499999998
+  6.0582017800000001 11.3105425600000000  6.4882826299999996
+  0.0040517800000000 11.3105725600000007  6.4882655299999996
+  3.0311532099999998  7.7341853599999997  0.9123054900000001
+  9.0851632099999993  7.7341953600000002  0.9123126000000000
+  3.0230812999999999  4.9623175699999997  6.4882665900000003
+  9.0772212999999997  4.9622875700000000  6.4882637000000001
+ 12.1043127199999994  1.3859403699999999  0.9123036700000000
+  6.0501727199999999  1.3859103699999999  0.9123265600000000
+  0.0501691200000000  4.7065010400000000  2.8276881199999999
+  6.1042791200000002  4.7064910400000004  2.8277152200000000
+  6.0040072100000001  7.9899634099999997  8.4036839699999994
+ 12.0581272100000003  7.9900134100000004  8.4037010799999994
+  3.0772167300000000  1.6417582399999999  8.4037178800000003
+  9.1312967300000007  1.6417482400000001  8.4036949900000000
+  2.9769896100000000 11.0547561999999999  2.8276842100000001
+  9.0310896100000004 11.0547362000000007  2.8277013100000001
+  4.5156189400000004 10.0925444199999994  4.7485038499999996
+ 10.5696489400000004 10.0924044199999994  4.7485109599999999
+  1.5387092400000000  2.6040715400000001 10.3245482299999995
+  7.5927392300000003  2.6040115400000001 10.3245253399999992
+  1.3626281400000000  9.1096203100000004  6.4718857500000002
+  7.4167081399999999  9.1095703100000005  6.4718328500000002
+  1.4883697199999999  8.9523616199999996 10.3245457500000004
+  7.5425797200000000  8.9522516299999992 10.3245128600000005
+  4.3896168400000004  9.9351631099999995  0.8958039700000000
+ 10.4437368399999997  9.9351431100000003  0.8958210800000000
+  1.6644776500000000  2.7613398100000000  6.4718681100000000
+  7.7185976500000004  2.7613198099999998  6.4718352200000000
+  4.6916063499999998  3.5868826100000000  0.8958463400000000
+ 10.7457363499999996  3.5868826100000000  0.8958134500000000
+  4.5657084499999998  3.7442043400000000  4.7485363400000002
+ 10.6197684500000005  3.7442543399999999  4.7484934399999998
+  1.4271435299999999  5.7840809399999999  8.3327970300000000
+  7.4812435300000004  5.7840109399999999  8.3328141400000000
+  4.6270127299999997  6.9124334500000000  2.7567950600000000
+ 10.6811427299999995  6.9124334500000000  2.7567721600000001
+  4.4542422500000001  0.5641837300000000  2.7567976500000002
+ 10.5083422500000001  0.5641637400000000  2.7567847500000000
+  1.5999540200000000 12.1323306500000001  8.3327844399999993
+  7.6540440199999997 12.1323306500000001  8.3327815399999992
+  1.4488319300000001  5.8551694799999998  4.7388569900000004
+  7.5029319299999999  5.8551594800000002  4.7388941000000004
+  4.6052662299999998  6.8413464700000004 10.3148350900000008
+ 10.6594162299999997  6.8413164799999997 10.3148222000000001
+  4.4984539600000000  0.5241951300000000  6.4733476400000001
+ 10.5525539599999991  0.5241751300000000  6.4733547500000004
+  4.4759357399999997  0.4930566900000000 10.3148076700000004
+ 10.5300357400000006  0.4930866900000000 10.3148047700000003
+  1.4714200500000001  5.8240679200000001  0.8973369900000000
+  7.5254900500000002  5.8240779299999996  0.8973441000000000
+  4.5827044399999997  6.8724549899999996  6.4733550900000001
+ 10.6368744399999997  6.8724349900000004  6.4733521999999999
+  1.5557505300000001 12.1723877900000002  0.8973444400000000
+  7.6098205300000004 12.1722777900000008  0.8973515500000000
+  1.5782524200000001 12.2033792699999992  4.7388244200000003
+  7.6324124199999996 12.2033992700000002  4.7388615300000003
+  1.5507855200000000  2.5414585299999999  2.7575782499999999
+  7.6049855199999996  2.5414785300000000  2.7574953600000001
+  4.5034907500000001 10.1550858599999998  8.3335738300000006
+ 10.5574507499999992 10.1550558599999992  8.3335109400000000
+  4.5777702700000003  3.8068257399999998  8.3335863099999994
+ 10.6319002699999992  3.8067957400000001  8.3335634100000000
+  1.4764060000000001  8.8896886500000001  2.7575657800000002
+  7.5304859999999998  8.8896686500000008  2.7575728900000001
--- a/GPUMD/raw2xyz.py
+++ b/GPUMD/raw2xyz.py
@@ -0,0 +1,149 @@
+import os
+import numpy as np
+
+
+def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz"):
+    """
+    Convert DeePMD-kit style .raw training data into the extended XYZ layout used for GPUMD NEP training.
+
+    The comment line of every frame carries Config_type, Weight, Lattice, Energy, Virial (9 components),
+    pbc and a Properties field; every atom line carries the species, position and force.
+
+    Works for datasets with a single frame as well as many frames (all arrays are
+    loaded with an explicit frame axis).
+
+    Args:
+        input_folder (str): Folder containing the .raw files (e.g. './set.000/').
+        output_filename (str): Path of the extended XYZ file to write. Missing parent
+            directories are created.
+
+    Raises:
+        FileNotFoundError: If one of the required .raw files is missing.
+        ValueError: If the data cannot be parsed or its dimensions are inconsistent.
+    """
+    required_files = [
+        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
+        'type.raw', 'type_map.raw', 'virial.raw'
+    ]
+    # Make sure every required file is present before reading anything
+    for filename in required_files:
+        filepath = os.path.join(input_folder, filename)
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(
+                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")
+    print(f"Loading raw from folder: {input_folder}")
+
+    # --- 1. Read the data ---
+    try:
+        # type_map.raw: one element symbol per non-empty line
+        with open(os.path.join(input_folder, 'type_map.raw'), 'r') as f:
+            type_map_list = [line.strip() for line in f if line.strip()]
+
+        # The first row of coord.raw fixes the atom count (3 numbers per atom)
+        first_coord_line = np.atleast_1d(
+            np.loadtxt(os.path.join(input_folder, 'coord.raw'), max_rows=1))
+        num_atoms = first_coord_line.size // 3
+        if num_atoms == 0:
+            raise ValueError("Could not determine num_atoms from coord.raw. It seems empty or malformed.")
+
+        # type.raw: either the first line lists all atom types, or the types are
+        # spread over several lines (e.g. one per line). Blank lines are ignored.
+        type_path = os.path.join(input_folder, 'type.raw')
+        with open(type_path, 'r') as f:
+            type_rows = [line.split() for line in f if line.strip()]
+        if not type_rows:
+            raise ValueError(f"{type_path} is empty or malformed.")
+
+        all_type_tokens = [tok for row in type_rows for tok in row]
+        if len(type_rows[0]) == num_atoms:
+            # The type sequence is assumed identical for all frames; use the first line
+            atom_types_numeric = np.array([int(x) for x in type_rows[0]])
+        elif len(all_type_tokens) == num_atoms:
+            atom_types_numeric = np.array([int(x) for x in all_type_tokens])
+        else:
+            raise ValueError(
+                f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
+                f"First line of type.raw has {len(type_rows[0])} types, total lines {len(type_rows)}. "
+                f"Please check type.raw format and adjust script.")
+
+        # Reject indices outside type_map (a negative index would silently wrap around)
+        bad_types = sorted({int(t) for t in atom_types_numeric if t < 0 or t >= len(type_map_list)})
+        if bad_types:
+            raise ValueError(
+                f"type.raw contains type indices {bad_types} not covered by type_map.raw "
+                f"({len(type_map_list)} entries).")
+        atom_symbols = [type_map_list[t] for t in atom_types_numeric]
+
+        # ndmin=2 / atleast_1d keep the frame axis even when there is only one frame
+        boxes = np.loadtxt(os.path.join(input_folder, 'box.raw'), ndmin=2).reshape(-1, 3, 3)
+        coords_flat = np.loadtxt(os.path.join(input_folder, 'coord.raw'), ndmin=2)
+        energies = np.atleast_1d(np.loadtxt(os.path.join(input_folder, 'energy.raw')))
+        forces_flat = np.loadtxt(os.path.join(input_folder, 'force.raw'), ndmin=2)
+        virials_flat = np.loadtxt(os.path.join(input_folder, 'virial.raw'), ndmin=2)  # 9 components expected
+
+    except Exception as e:
+        raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") from e
+
+    # --- Validate dimensions ---
+    num_configs = len(energies)
+    expected_coord_cols = num_atoms * 3
+    expected_virial_cols = 9
+
+    if coords_flat.shape[1] != expected_coord_cols:
+        raise ValueError(
+            f"coord.raw has {coords_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3).")
+    if coords_flat.shape[0] != num_configs:
+        raise ValueError(
+            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
+    if boxes.shape[0] != num_configs:
+        raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
+    if forces_flat.shape[1] != expected_coord_cols:
+        raise ValueError(
+            f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.")
+    if forces_flat.shape[0] != num_configs:
+        raise ValueError(
+            f"force.raw has {forces_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
+    if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols:
+        raise ValueError(
+            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")
+
+    coords = coords_flat.reshape(num_configs, num_atoms, 3)
+    forces = forces_flat.reshape(num_configs, num_atoms, 3)
+    virials_matrix = virials_flat.reshape(num_configs, 3, 3)
+
+    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")
+
+    # --- 2. Write the extended XYZ file ---
+    # Create the output directory when the path has one and it does not exist yet
+    output_dir = os.path.dirname(output_filename)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    output_filepath = output_filename
+
+    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"
+    weight_str = "Weight=1.0"
+
+    with open(output_filepath, 'w') as f:
+        for i in range(num_configs):
+            # Line 1: number of atoms
+            f.write(f"{num_atoms}\n")
+
+            # Line 2: per-frame metadata
+            box_str = " ".join(f"{x:.10f}" for x in boxes[i].flatten())
+            energy_str = f"{energies[i]:.10f}"
+            # All nine virial components, row-major
+            virial_str = " ".join(f"{x:.10f}" for x in virials_matrix[i].flatten())
+            config_type_str = f"Config_type=dpgen_iter{i:03d}"  # label built from the frame index
+
+            f.write(
+                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n'
+            )
+
+            # Remaining lines: symbol, position and force of each atom
+            for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, coords[i], forces[i]):
+                f.write(f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n")
+
+    print(f"Successfully converted {num_configs} configurations to {output_filepath}")
+    print(f"Output file saved at: {output_filepath}")
+
+
+# --- How to use this function ---
+if __name__ == "__main__":
+    # Example usage: convert the .raw files in the folder below into a single extended XYZ file.
+    input_folder_path = 'data/dpmd_data/lyc/training_data/p3m1_data/raw'
+    output_file_path = 'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz'
+
+    convert_raw_to_gpumd_xyz(input_folder=input_folder_path, output_filename=output_file_path)
--- a/GPUMD/swap_li.py
+++ b/GPUMD/swap_li.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+# -*- coding: ascii -*-
+"""
+Randomly swap one Li-Y pair in a VASP5 POSCAR and write N new files.
+- Keeps coordinate mode (Direct/Cartesian), Selective Dynamics flags, and Velocities.
+- Requires VASP5+ POSCAR (with element symbols line).
+"""
+
+import random
+from pathlib import Path
+
+
+
+def _is_ints(tokens):
+    """Return True if int() accepts every token, False at the first token that raises ValueError."""
+    for token in tokens:
+        try:
+            int(token)
+        except ValueError:
+            return False
+    return True
+
+def _find_species_index(species, target):
+    """Return the position of the first symbol in species equal to target, ignoring case.
+
+    Raises ValueError when no symbol matches.
+    """
+    wanted = target.lower()
+    match = next((pos for pos, name in enumerate(species) if name.lower() == wanted), None)
+    if match is None:
+        raise ValueError("Element '%s' not found in species line: %s" % (target, " ".join(species)))
+    return match
+
+def parse_poscar(lines):
+    """Split the lines of a VASP5 POSCAR into its sections.
+
+    Args:
+        lines: The file content as a list of lines (trailing newlines are stripped).
+
+    Returns:
+        A dict with the header lines ("comment", "scale", "lv"), the species symbols and
+        counts ("species", "counts", "counts_line"), the optional Selective dynamics line
+        ("selective", "sel_line"), the coordinate-mode line ("coord_line"), the atom total
+        ("natoms"), the position lines ("pos_lines"), an optional velocity section
+        ("vel_header", "vel_lines"), any remaining lines ("tail_lines") and, per species,
+        a (symbol, start, end) half-open index range into pos_lines ("species_ranges").
+
+    Raises:
+        ValueError: If the file is too short, is in VASP4 format (no symbols line),
+            has a counts line that does not match the species line, or declares more
+            atoms / velocities than there are lines.
+    """
+    if len(lines) < 8:
+        raise ValueError("POSCAR too short")
+
+    comment = lines[0].rstrip("\n")
+    scale = lines[1].rstrip("\n")
+    lv = [lines[2].rstrip("\n"), lines[3].rstrip("\n"), lines[4].rstrip("\n")]
+
+    i = 5
+    tokens = lines[i].split()
+    if _is_ints(tokens):
+        raise ValueError("VASP4 format (no element symbols line) is not supported.")
+    species = tokens
+    i += 1
+
+    counts_line = lines[i].rstrip("\n")
+    counts = [int(x) for x in counts_line.split()]
+    if len(counts) != len(species):
+        # Without this check a mismatch surfaces later as an IndexError or is silently ignored
+        raise ValueError(
+            "Species line has %d symbols but counts line has %d entries." % (len(species), len(counts)))
+    i += 1
+
+    # Optional "Selective dynamics" line (recognised by its first letter)
+    selective = False
+    sel_line = None
+    if i < len(lines) and lines[i].strip().lower().startswith("s"):
+        selective = True
+        sel_line = lines[i].rstrip("\n")
+        i += 1
+
+    if i >= len(lines):
+        # Happens for an 8-line file that contains a Selective dynamics line
+        raise ValueError("POSCAR too short")
+    coord_line = lines[i].rstrip("\n")
+    i += 1
+
+    natoms = sum(counts)
+    pos_start = i
+    pos_end = i + natoms
+    if pos_end > len(lines):
+        raise ValueError("Atom count exceeds file length.")
+    pos_lines = [line.rstrip("\n") for line in lines[pos_start:pos_end]]
+
+    # Optional Velocities section: skip blank lines, then look for a "veloc..." header
+    k = pos_end
+    while k < len(lines) and lines[k].strip() == "":
+        k += 1
+
+    vel_header = None
+    vel_lines = None
+    vel_end = k
+    if k < len(lines) and lines[k].strip().lower().startswith("veloc"):
+        vel_header = lines[k].rstrip("\n")
+        vel_start = k + 1
+        vel_end = vel_start + natoms
+        if vel_end > len(lines):
+            raise ValueError("Velocities section length inconsistent with atom count.")
+        vel_lines = [line.rstrip("\n") for line in lines[vel_start:vel_end]]
+
+    # Everything after the positions / velocities is kept verbatim
+    tail_lines = [line.rstrip("\n") for line in lines[vel_end:]]
+
+    # Half-open index range of each species, in species-line order
+    species_ranges = []
+    acc = 0
+    for sp, c in zip(species, counts):
+        species_ranges.append((sp, acc, acc + c))
+        acc += c
+
+    return {
+        "comment": comment,
+        "scale": scale,
+        "lv": lv,
+        "species": species,
+        "counts": counts,
+        "counts_line": counts_line,
+        "selective": selective,
+        "sel_line": sel_line,
+        "coord_line": coord_line,
+        "natoms": natoms,
+        "pos_lines": pos_lines,
+        "vel_header": vel_header,
+        "vel_lines": vel_lines,
+        "tail_lines": tail_lines,
+        "species_ranges": species_ranges,
+    }
+
+def build_poscar(data, pos_lines, vel_lines=None):
+    """Assemble POSCAR text from a parsed-POSCAR dict and the given position / velocity lines.
+
+    The velocity section is emitted only when the dict has a velocity header and
+    vel_lines is given. The result always ends with a newline.
+    """
+    rows = [
+        data["comment"],
+        data["scale"],
+        *data["lv"],
+        " ".join(data["species"]),
+        data["counts_line"],
+    ]
+    if data["selective"]:
+        stored = data["sel_line"]
+        rows.append("Selective dynamics" if stored is None else stored)
+    rows.append(data["coord_line"])
+    rows += list(pos_lines)
+    if not (data["vel_header"] is None or vel_lines is None):
+        rows.append(data["vel_header"])
+        rows += list(vel_lines)
+    trailing = data["tail_lines"]
+    if trailing:
+        rows += list(trailing)
+    return "".join(row + "\n" for row in rows)
+
+def _swap_once(data, rng, li_label="Li", y_label="Y"):
+    """Pick one random atom of each of the two species and exchange their lines.
+
+    Returns (new position lines, new velocity lines or None, (first pick, second pick)).
+    The lists stored in data are copied, not modified.
+    """
+    first_species = _find_species_index(data["species"], li_label)
+    second_species = _find_species_index(data["species"], y_label)
+    _sym_a, a_lo, a_hi = data["species_ranges"][first_species]
+    _sym_b, b_lo, b_hi = data["species_ranges"][second_species]
+
+    # Draw order matters for reproducibility: first species, then second species
+    li_pick = rng.randrange(a_lo, a_hi)
+    y_pick = rng.randrange(b_lo, b_hi)
+
+    def exchanged(rows):
+        # Copy of rows with the two picked entries swapped
+        result = list(rows)
+        result[li_pick], result[y_pick] = result[y_pick], result[li_pick]
+        return result
+
+    new_pos = exchanged(data["pos_lines"])
+    velocities = data["vel_lines"]
+    new_vel = exchanged(velocities) if velocities is not None else None
+
+    return new_pos, new_vel, (li_pick, y_pick)
+
+def swap(n, input_file, output_dir, seed=None):
+    """
+    Generate n POSCAR files, each with one random Li-Y swap applied to the input structure.
+
+    Every output starts from the unmodified input, so each file differs from the
+    input by exactly one swapped pair. Picks are independent, so two files may
+    contain the same swap.
+
+    Args:
+        n: Number of files to write (files are numbered 1..n).
+        input_file: Path of the VASP5 POSCAR to read.
+        output_dir: Directory for the results; created if missing.
+        seed: Optional seed for the random generator. With the default (None) the
+            generator is seeded from the system, as before; pass a value to make
+            the set of swaps reproducible.
+
+    Returns: list of Path to written files.
+    """
+    input_path = Path(input_file)
+    out_dir = Path(output_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    lines = input_path.read_text().splitlines()
+    data = parse_poscar(lines)
+
+    rng = random.Random(seed)
+    base = input_path.name
+
+    out_paths = []
+    for k in range(1, n + 1):
+        new_pos, new_vel, picked = _swap_once(data, rng)
+        out_path = out_dir / f"swap_{k}_{base}"
+        out_path.write_text(build_poscar(data, new_pos, new_vel))
+        out_paths.append(out_path)
+        print(f"Wrote {out_path}  (swapped Li idx {picked[0]} <-> Y idx {picked[1]})")
+    return out_paths
+# --------- Editable defaults for direct run ---------
+# These module-level values are only used by the __main__ block below.
+INPUT_FILE = "data_POSCAR/origin/p3m1.vasp"   # path to input POSCAR
+OUTPUT_DIR = "data_POSCAR/p3m1"    # output directory
+N = 5                   # number of files to generate
+# ----------------------------------------------------
+if __name__ == "__main__":
+    # Direct-run entry: edit INPUT_FILE/OUTPUT_DIR/N above to change behavior.
+    swap(n=N, input_file=INPUT_FILE, output_dir=OUTPUT_DIR)
--- a/GPUMD/t-SNE/t-SNE.py
+++ b/GPUMD/t-SNE/t-SNE.py
@@ -0,0 +1,140 @@
+from pathlib import Path
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+from sklearn.decomposition import PCA
+
+def tsne_dir_shared_coords(
+    dir_path: str,
+    *,
+    metric: str = "euclidean",     # "cosine" is worth trying; "euclidean" keeps scale differences
+    perplexity: float = 50.0,      # author's guidance: 30-50 for roughly 30k-50k samples
+    n_iter: int = 1000,
+    early_exaggeration: float = 12.0,
+    learning_rate = "auto",
+    standardize: bool = False,
+    pca_dim: int | None = None,    # optionally reduce with PCA to pca_dim (e.g. 20) first to speed up t-SNE
+    context: bool = True,
+    make_joint: bool = True,
+    init: str = "pca",
+    random_state: int = 42
+) -> None:
+    """
+    Embed all .npy files of a directory with one shared t-SNE fit and plot each file in that common 2-D space.
+
+    - Every .npy must be shaped (n_samples, 30) or (30, n_samples); other files are skipped.
+    - Rows with NaN/Inf are removed; files left with fewer than 3 rows are skipped.
+    - All usable files are stacked, optionally standardised and PCA-reduced, and fitted once.
+    - Every per-file figure uses the same axis limits and is saved as <stem>_tsne_shared.png;
+      when ``make_joint`` is true an overview is saved as tsne_shared_overview.png in ``dir_path``.
+
+    Args:
+        dir_path: Directory searched (non-recursively) for ``*.npy`` files.
+        metric, early_exaggeration, learning_rate, init, random_state: Passed to ``TSNE``.
+        perplexity: Passed to ``TSNE``; lowered to n_samples - 1 when it is not smaller
+            than the total sample count (scikit-learn rejects such values).
+        n_iter: Number of optimisation iterations, passed to ``TSNE`` as ``max_iter``
+            when that keyword exists and as ``n_iter`` otherwise.
+        standardize: Z-score every feature over the stacked data before fitting.
+        pca_dim: When given and > 2, run PCA first; capped at the data dimensions.
+        context: Draw all points in grey behind each per-file plot and add a legend.
+        make_joint: Also write the overview figure.
+
+    Raises:
+        ValueError: If ``dir_path`` is not an existing directory.
+    """
+    p = Path(dir_path)
+    if not p.is_dir():
+        raise ValueError(f"{dir_path!r} 不是有效文件夹")
+
+    files = sorted(p.glob("*.npy"))
+    if not files:
+        print(f"目录 {p} 中未找到 .npy 文件")
+        return
+
+    X_list, paths, counts = [], [], []
+    for f in files:
+        try:
+            data = np.load(f)
+            if data.ndim != 2:
+                print(f"[跳过] {f.name}: 期望二维数组，实际 shape={data.shape}")
+                continue
+
+            # Normalise the layout to (n_samples, 30)
+            if data.shape[1] == 30:
+                X = data
+            elif data.shape[0] == 30:
+                X = data.T
+            else:
+                print(f"[跳过] {f.name}: shape={data.shape}, 未检测到 30 维特征")
+                continue
+
+            mask = np.isfinite(X).all(axis=1)
+            if not np.all(mask):
+                X = X[mask]
+                print(f"[提示] {f.name}: 移除了含 NaN/Inf 的样本行")
+
+            if X.shape[0] < 3:
+                print(f"[跳过] {f.name}: 样本数过少(n={X.shape[0]})")
+                continue
+
+            X_list.append(X)
+            paths.append(f)
+            counts.append(X.shape[0])
+        except Exception as e:
+            print(f"[错误] 读取 {f.name} 失败: {e}")
+
+    if not X_list:
+        print("未找到可用的数据文件")
+        return
+
+    X_all = np.vstack(X_list)
+    n_total = X_all.shape[0]
+
+    if standardize:
+        mean = X_all.mean(axis=0)
+        std = X_all.std(axis=0)
+        std[std == 0] = 1.0  # constant columns: avoid division by zero
+        X_all = (X_all - mean) / std
+
+    if pca_dim is not None and pca_dim > 2:
+        # PCA cannot return more components than min(n_samples, n_features)
+        n_comp = min(int(pca_dim), X_all.shape[0], X_all.shape[1])
+        X_all = PCA(n_components=n_comp, random_state=random_state).fit_transform(X_all)
+
+    # scikit-learn requires perplexity < n_samples; n_total >= 3 here, so n_total - 1 >= 2
+    if float(perplexity) >= n_total:
+        adjusted = float(n_total - 1)
+        print(f"[提示] perplexity={perplexity} 不小于样本数 {n_total}，已调整为 {adjusted}")
+        perplexity = adjusted
+
+    tsne_kwargs = dict(
+        n_components=2,
+        metric=metric,
+        perplexity=float(perplexity),
+        early_exaggeration=float(early_exaggeration),
+        learning_rate=learning_rate,
+        init=init,
+        random_state=random_state,
+        method="barnes_hut",   # suited to large sample counts
+        angle=0.5,
+        verbose=0,
+    )
+    # The iteration keyword is "max_iter" in newer scikit-learn releases and "n_iter" in older ones
+    import inspect
+    iter_key = "max_iter" if "max_iter" in inspect.signature(TSNE.__init__).parameters else "n_iter"
+    tsne_kwargs[iter_key] = int(n_iter)
+
+    tsne = TSNE(**tsne_kwargs)
+    Z_all = tsne.fit_transform(X_all)
+
+    # Common axis limits with a 5% margin (fallback margin 1.0 for a degenerate range)
+    x_min, x_max = float(Z_all[:, 0].min()), float(Z_all[:, 0].max())
+    y_min, y_max = float(Z_all[:, 1].min()), float(Z_all[:, 1].max())
+    pad_x = 0.05 * (x_max - x_min) if x_max > x_min else 1.0
+    pad_y = 0.05 * (y_max - y_min) if y_max > y_min else 1.0
+
+    colors = [
+        "#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd",
+        "#8c564b","#e377c2","#7f7f7f","#bcbd22","#17becf"
+    ]
+
+    # One figure per file
+    start = 0
+    for i, (f, n) in enumerate(zip(paths, counts)):
+        Zi = Z_all[start:start + n]
+        start += n
+        fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
+        if context:
+            ax.scatter(Z_all[:, 0], Z_all[:, 1], s=5, c="#cccccc", alpha=0.35, edgecolors="none", label="All")
+        ax.scatter(Zi[:, 0], Zi[:, 1], s=8, c=colors[i % len(colors)], alpha=0.9, edgecolors="none", label=f.name)
+        ax.set_title(f"{f.name} • t-SNE(shared) (perp={perplexity}, metric={metric})", fontsize=9)
+        ax.set_xlabel("t-SNE-1")
+        ax.set_ylabel("t-SNE-2")
+        ax.set_xlim(x_min - pad_x, x_max + pad_x)
+        ax.set_ylim(y_min - pad_y, y_max + pad_y)
+        ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
+        if context:
+            ax.legend(loc="best", fontsize=8, frameon=False)
+        fig.tight_layout()
+        out_png = f.with_suffix("").as_posix() + "_tsne_shared.png"
+        fig.savefig(out_png)
+        plt.close(fig)
+        print(f"[完成] {f.name} -> {out_png}")
+
+    # Overview figure
+    if make_joint:
+        start = 0
+        fig, ax = plt.subplots(figsize=(7, 6), dpi=150)
+        for i, (f, n) in enumerate(zip(paths, counts)):
+            Zi = Z_all[start:start + n]
+            start += n
+            ax.scatter(Zi[:, 0], Zi[:, 1], s=8, c=colors[i % len(colors)], alpha=0.85, edgecolors="none", label=f.name)
+        ax.set_title(f"t-SNE(shared) overview (perp={perplexity}, metric={metric})", fontsize=10)
+        ax.set_xlabel("t-SNE-1")
+        ax.set_ylabel("t-SNE-2")
+        ax.set_xlim(x_min - pad_x, x_max + pad_x)
+        ax.set_ylim(y_min - pad_y, y_max + pad_y)
+        ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
+        ax.legend(loc="best", fontsize=8, frameon=False)
+        fig.tight_layout()
+        out_png = Path(dir_path) / "tsne_shared_overview.png"
+        fig.savefig(out_png.as_posix())
+        plt.close(fig)
+        print(f"[完成] 总览 -> {out_png}")
+
+# Script entry point: embed every .npy file in the relative "data" directory with default settings.
+if __name__ == "__main__":
+    tsne_dir_shared_coords("data")