一些小修改

This commit is contained in:
2025-11-19 12:23:17 +08:00
parent 95d719cc1e
commit 80ae03c8c1
25 changed files with 2291 additions and 17 deletions

76
GPUMD/Umap/umap_make.py Normal file
View File

@@ -0,0 +1,76 @@
from pathlib import Path
import numpy as np
import matplotlib
matplotlib.use("Agg") # 仅保存图片,不弹窗
import matplotlib.pyplot as plt
from umap import UMAP
def umap_dir_to_pngs(dir_path: str) -> None:
    """
    Run UMAP (30-D -> 2-D) on every .npy file in a directory and save a scatter plot for each.

    - Each .npy file must hold a 2-D array shaped (n_samples, 30) or
      (30, n_samples); the second layout is transposed before use.
    - Rows containing NaN/Inf are removed before fitting.
    - The image is saved next to the input file as <original stem>_umap.png.
    - Unusable files are skipped with a message, and an error raised while
      processing one file is printed without stopping the remaining files.

    Args:
        dir_path: Directory searched (non-recursively) for ``*.npy`` files.

    Raises:
        ValueError: If ``dir_path`` is not an existing directory.
    """
    p = Path(dir_path)
    if not p.is_dir():
        raise ValueError(f"{dir_path!r} 不是有效文件夹")
    files = sorted(p.glob("*.npy"))
    if not files:
        print(f"目录 {p} 中未找到 .npy 文件")
        return
    for f in files:
        try:
            data = np.load(f)
            if data.ndim != 2:
                print(f"[跳过] {f.name}: 期望二维数组,实际 shape={data.shape}")
                continue
            # Normalise the layout to (n_samples, 30).
            if data.shape[1] == 30:
                X = data
            elif data.shape[0] == 30:
                X = data.T
            else:
                print(f"[跳过] {f.name}: shape={data.shape}, 未检测到 30 维特征")
                continue
            # Drop sample rows that contain non-finite values.
            mask = np.isfinite(X).all(axis=1)
            if not np.all(mask):
                X = X[mask]
                print(f"[提示] {f.name}: 移除了含 NaN/Inf 的样本行")
            n_samples = X.shape[0]
            if n_samples < 3:
                print(f"[跳过] {f.name}: 样本数过少(n={n_samples}),无法稳定降维")
                continue
            # Keep n_neighbors valid: at least 2, at most 15 and at most n_samples - 1.
            n_neighbors = min(15, max(2, n_samples - 1))
            reducer = UMAP(
                n_components=2,
                n_neighbors=n_neighbors,
                min_dist=0.1,
                metric="euclidean",
                random_state=42,
            )
            emb = reducer.fit_transform(X)
            fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
            try:
                ax.scatter(emb[:, 0], emb[:, 1], s=6, c="#1f77b4", alpha=0.8, edgecolors="none")
                ax.set_title(f"{f.name} • UMAP (n={len(X)}, nn={n_neighbors})", fontsize=10)
                ax.set_xlabel("UMAP-1")
                ax.set_ylabel("UMAP-2")
                ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
                fig.tight_layout()
                out_png = f.with_suffix("").as_posix() + "_umap.png"
                fig.savefig(out_png)
            finally:
                # Always release the figure, even when drawing or saving fails;
                # the error is swallowed below, so leaked figures would pile up.
                plt.close(fig)
            print(f"[完成] {f.name} -> {out_png}")
        except Exception as e:
            # Best effort per file: report the problem and move on to the next one.
            print(f"[错误] 处理 {f.name} 失败: {e}")
# Script entry point: process the .npy files in the "data" directory
# (resolved against the current working directory).
if __name__ == "__main__":
    umap_dir_to_pngs(dir_path="data")

161
GPUMD/Umap/umap_make_2.py Normal file
View File

@@ -0,0 +1,161 @@
from pathlib import Path
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
try:
from umap import UMAP
except Exception:
from umap.umap_ import UMAP
def umap_dir_shared_coords(
    dir_path: str,
    *,
    metric: str = "cosine",
    n_neighbors: int = 15,
    min_dist: float = 0.0,
    spread: float = 1.2,
    standardize: bool = False,
    context: bool = True,
    make_joint: bool = True,
    init: str = "random",  # key choice: skips spectral initialisation to avoid its warning; "pca" also works
    jitter: float = 0.0,  # optional: tiny noise added before fitting, e.g. 1e-6
    random_state: int = 42
) -> None:
    """
    Embed all .npy files of a directory with one shared UMAP fit and plot each file in 2-D.

    - Every .npy file must be shaped (n_samples, 30) or (30, n_samples).
    - All usable files are stacked and embedded together, so every figure
      uses the same coordinates and the same axis limits.
    - Each file gets its own <stem>_umap_shared.png; when ``make_joint`` is
      true an overview figure umap_shared_overview.png is written as well.

    Args:
        dir_path: Directory searched (non-recursively) for ``*.npy`` files.
        metric, min_dist, spread, init, random_state: Passed to ``UMAP``.
        n_neighbors: Passed to ``UMAP`` after being raised to at least 2.
        standardize: Z-score every feature over the stacked data first
            (features with zero standard deviation are left unscaled).
        context: Draw all points in grey behind the highlighted file and add a legend.
        make_joint: Also write the overview figure with every file coloured separately.
        jitter: Standard deviation of Gaussian noise added before fitting; 0 disables it.

    Raises:
        ValueError: If ``dir_path`` is not an existing directory.
    """
    root = Path(dir_path)
    if not root.is_dir():
        raise ValueError(f"{dir_path!r} 不是有效文件夹")
    npy_files = sorted(root.glob("*.npy"))
    if not npy_files:
        print(f"目录 {root} 中未找到 .npy 文件")
        return

    # Collect (path, cleaned feature matrix) for every usable file.
    usable = []
    for npy in npy_files:
        try:
            arr = np.load(npy)
            if arr.ndim != 2:
                print(f"[跳过] {npy.name}: 期望二维数组,实际 shape={arr.shape}")
                continue
            n_rows, n_cols = arr.shape
            if n_cols == 30:
                feats = arr
            elif n_rows == 30:
                feats = arr.T
            else:
                print(f"[跳过] {npy.name}: shape={arr.shape}, 未检测到 30 维特征")
                continue
            finite_rows = np.isfinite(feats).all(axis=1)
            if not finite_rows.all():
                feats = feats[finite_rows]
                print(f"[提示] {npy.name}: 移除了含 NaN/Inf 的样本行")
            if feats.shape[0] < 3:
                print(f"[跳过] {npy.name}: 样本数过少(n={feats.shape[0]})")
                continue
            usable.append((npy, feats))
        except Exception as e:
            print(f"[错误] 读取 {npy.name} 失败: {e}")
    if not usable:
        print("未找到可用的数据文件")
        return

    stacked = np.vstack([feats for _, feats in usable])
    if standardize:
        mu = stacked.mean(axis=0)
        sigma = stacked.std(axis=0)
        sigma[sigma == 0] = 1.0  # constant features: avoid division by zero
        stacked = (stacked - mu) / sigma
    if jitter and jitter > 0:
        noise_rng = np.random.default_rng(random_state)
        stacked = stacked + noise_rng.normal(scale=jitter, size=stacked.shape)

    embedding = UMAP(
        n_components=2,
        n_neighbors=int(max(2, n_neighbors)),
        min_dist=float(min_dist),
        spread=float(spread),
        metric=metric,
        init=init,  # key change: avoids the spectral-initialisation warning
        random_state=random_state,
    ).fit_transform(stacked)

    # Shared axis limits: data range plus a 5 % margin (or 1.0 for a degenerate range).
    x_min, x_max = float(embedding[:, 0].min()), float(embedding[:, 0].max())
    y_min, y_max = float(embedding[:, 1].min()), float(embedding[:, 1].max())
    pad_x = 0.05 * (x_max - x_min) if x_max > x_min else 1.0
    pad_y = 0.05 * (y_max - y_min) if y_max > y_min else 1.0
    x_limits = (x_min - pad_x, x_max + pad_x)
    y_limits = (y_min - pad_y, y_max + pad_y)

    palette = (
        "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
        "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
    )

    # Split the joint embedding back into one slice per input file.
    groups = []
    offset = 0
    for npy, feats in usable:
        stop = offset + feats.shape[0]
        groups.append((npy, embedding[offset:stop]))
        offset = stop

    # One figure per file.
    for idx, (npy, pts) in enumerate(groups):
        fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
        if context:
            ax.scatter(embedding[:, 0], embedding[:, 1], s=5, c="#cccccc",
                       alpha=0.35, edgecolors="none", label="All")
        ax.scatter(pts[:, 0], pts[:, 1], s=10,
                   c=palette[idx % len(palette)],
                   alpha=0.9, edgecolors="none", label=npy.name)
        ax.set_title(
            f"{npy.name} • UMAP(shared) (nn={n_neighbors}, min={min_dist}, metric={metric}, init={init})",
            fontsize=9
        )
        ax.set_xlabel("UMAP-1")
        ax.set_ylabel("UMAP-2")
        ax.set_xlim(*x_limits)
        ax.set_ylim(*y_limits)
        ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
        if context:
            ax.legend(loc="best", fontsize=8, frameon=False)
        fig.tight_layout()
        out_png = npy.with_suffix("").as_posix() + "_umap_shared.png"
        fig.savefig(out_png)
        plt.close(fig)
        print(f"[完成] {npy.name} -> {out_png}")

    if not make_joint:
        return

    # Overview figure: every file in its own colour.
    fig, ax = plt.subplots(figsize=(7, 6), dpi=150)
    for idx, (npy, pts) in enumerate(groups):
        ax.scatter(pts[:, 0], pts[:, 1], s=8,
                   c=palette[idx % len(palette)],
                   alpha=0.85, edgecolors="none", label=npy.name)
    ax.set_title(f"UMAP(shared) overview (metric={metric}, nn={n_neighbors}, min={min_dist}, init={init})",
                 fontsize=10)
    ax.set_xlabel("UMAP-1")
    ax.set_ylabel("UMAP-2")
    ax.set_xlim(*x_limits)
    ax.set_ylim(*y_limits)
    ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
    ax.legend(loc="best", fontsize=8, frameon=False, ncol=1)
    fig.tight_layout()
    out_png = Path(dir_path) / "umap_shared_overview.png"
    fig.savefig(out_png.as_posix())
    plt.close(fig)
    print(f"[完成] 总览 -> {out_png}")
# Script entry point: embed every .npy file in the "data" directory
# (resolved against the current working directory) with the default settings.
if __name__ == "__main__":
    umap_dir_shared_coords(dir_path="data")

View File

@@ -0,0 +1,88 @@
Li Y Cl
1.0000000000000000
12.1082364219999992 -0.0000000000000000 0.0000000000000000
0.0000420925000000 12.6964871139000000 0.0000000000000000
0.0000111360000000 0.0000097283000000 11.1520040839999997
Li Y Cl
24 8 48
Cartesian
3.0170113299999999 11.0208475999999997 6.5429541999999996
9.0710813300000002 11.0208076100000003 6.5429413099999998
3.0372732299999998 1.6755553700000001 0.9669378900000000
9.0914532300000008 1.6755853700000001 0.9668849900000001
5.9960454600000004 8.0228419300000002 4.6273539599999998
12.0502254600000001 8.0228319300000006 4.6273410699999999
6.0439837100000000 8.0239104000000001 0.9669839400000000
12.0980237099999997 8.0238703999999998 0.9669410500000000
2.9687930800000002 11.0219791300000001 10.2032142199999996
9.0228930799999993 11.0219691300000004 10.2032413300000009
3.0851749800000001 1.6745967300000000 4.6273578600000000
9.1393549800000002 1.6745967399999999 4.6273049700000000
0.0581325900000000 4.6736939399999997 10.2033481199999994
6.1122525899999998 4.6736539400000003 10.2033652299999993
0.0102008400000000 4.6725925699999999 6.5430081500000004
6.0643308400000002 4.6725225699999999 6.5430152499999998
6.0582017800000001 11.3105425600000000 6.4882826299999996
0.0040517800000000 11.3105725600000007 6.4882655299999996
3.0311532099999998 7.7341853599999997 0.9123054900000001
9.0851632099999993 7.7341953600000002 0.9123126000000000
3.0230812999999999 4.9623175699999997 6.4882665900000003
9.0772212999999997 4.9622875700000000 6.4882637000000001
12.1043127199999994 1.3859403699999999 0.9123036700000000
6.0501727199999999 1.3859103699999999 0.9123265600000000
0.0501691200000000 4.7065010400000000 2.8276881199999999
6.1042791200000002 4.7064910400000004 2.8277152200000000
6.0040072100000001 7.9899634099999997 8.4036839699999994
12.0581272100000003 7.9900134100000004 8.4037010799999994
3.0772167300000000 1.6417582399999999 8.4037178800000003
9.1312967300000007 1.6417482400000001 8.4036949900000000
2.9769896100000000 11.0547561999999999 2.8276842100000001
9.0310896100000004 11.0547362000000007 2.8277013100000001
4.5156189400000004 10.0925444199999994 4.7485038499999996
10.5696489400000004 10.0924044199999994 4.7485109599999999
1.5387092400000000 2.6040715400000001 10.3245482299999995
7.5927392300000003 2.6040115400000001 10.3245253399999992
1.3626281400000000 9.1096203100000004 6.4718857500000002
7.4167081399999999 9.1095703100000005 6.4718328500000002
1.4883697199999999 8.9523616199999996 10.3245457500000004
7.5425797200000000 8.9522516299999992 10.3245128600000005
4.3896168400000004 9.9351631099999995 0.8958039700000000
10.4437368399999997 9.9351431100000003 0.8958210800000000
1.6644776500000000 2.7613398100000000 6.4718681100000000
7.7185976500000004 2.7613198099999998 6.4718352200000000
4.6916063499999998 3.5868826100000000 0.8958463400000000
10.7457363499999996 3.5868826100000000 0.8958134500000000
4.5657084499999998 3.7442043400000000 4.7485363400000002
10.6197684500000005 3.7442543399999999 4.7484934399999998
1.4271435299999999 5.7840809399999999 8.3327970300000000
7.4812435300000004 5.7840109399999999 8.3328141400000000
4.6270127299999997 6.9124334500000000 2.7567950600000000
10.6811427299999995 6.9124334500000000 2.7567721600000001
4.4542422500000001 0.5641837300000000 2.7567976500000002
10.5083422500000001 0.5641637400000000 2.7567847500000000
1.5999540200000000 12.1323306500000001 8.3327844399999993
7.6540440199999997 12.1323306500000001 8.3327815399999992
1.4488319300000001 5.8551694799999998 4.7388569900000004
7.5029319299999999 5.8551594800000002 4.7388941000000004
4.6052662299999998 6.8413464700000004 10.3148350900000008
10.6594162299999997 6.8413164799999997 10.3148222000000001
4.4984539600000000 0.5241951300000000 6.4733476400000001
10.5525539599999991 0.5241751300000000 6.4733547500000004
4.4759357399999997 0.4930566900000000 10.3148076700000004
10.5300357400000006 0.4930866900000000 10.3148047700000003
1.4714200500000001 5.8240679200000001 0.8973369900000000
7.5254900500000002 5.8240779299999996 0.8973441000000000
4.5827044399999997 6.8724549899999996 6.4733550900000001
10.6368744399999997 6.8724349900000004 6.4733521999999999
1.5557505300000001 12.1723877900000002 0.8973444400000000
7.6098205300000004 12.1722777900000008 0.8973515500000000
1.5782524200000001 12.2033792699999992 4.7388244200000003
7.6324124199999996 12.2033992700000002 4.7388615300000003
1.5507855200000000 2.5414585299999999 2.7575782499999999
7.6049855199999996 2.5414785300000000 2.7574953600000001
4.5034907500000001 10.1550858599999998 8.3335738300000006
10.5574507499999992 10.1550558599999992 8.3335109400000000
4.5777702700000003 3.8068257399999998 8.3335863099999994
10.6319002699999992 3.8067957400000001 8.3335634100000000
1.4764060000000001 8.8896886500000001 2.7575657800000002
7.5304859999999998 8.8896686500000008 2.7575728900000001

149
GPUMD/raw2xyz.py Normal file
View File

@@ -0,0 +1,149 @@
import os
import numpy as np
def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz"):
    """
    Convert DeePMD-kit style .raw training data to the extended XYZ format used for GPUMD NEP training.

    Every frame is written as: one line with the atom count, one comment line
    carrying Config_type, Weight, Lattice, Energy, Virial (all nine
    components), pbc and a Properties field, then one line per atom holding
    the species symbol, the position and the force.

    A data set containing a single frame is accepted: the per-frame arrays
    are always handled as one row per frame, even when a file has one row.

    Args:
        input_folder (str): Folder containing the .raw files (for example './set.000/').
        output_filename (str): Path of the extended XYZ file to write. Missing
            parent directories are created.

    Raises:
        FileNotFoundError: If a required .raw file does not exist.
        ValueError: If the files cannot be parsed or their shapes are inconsistent.
    """
    required_files = [
        'box.raw', 'coord.raw', 'energy.raw', 'force.raw',
        'type.raw', 'type_map.raw', 'virial.raw'
    ]
    # Fail early with a clear message when any required file is missing.
    for filename in required_files:
        filepath = os.path.join(input_folder, filename)
        if not os.path.exists(filepath):
            raise FileNotFoundError(
                f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.")
    print(f"Loading raw from folder: {input_folder}")

    def raw_path(name):
        """Return the path of a .raw file inside the input folder."""
        return os.path.join(input_folder, name)

    # --- 1. Read the data ---
    try:
        # Type names may be written one per line or whitespace-separated on one line.
        with open(raw_path('type_map.raw'), 'r') as f:
            type_map_list = f.read().split()

        # ndmin=2 keeps one row per frame even when the file holds a single frame.
        coords_flat = np.loadtxt(raw_path('coord.raw'), ndmin=2)
        if coords_flat.size == 0 or coords_flat.shape[1] < 3:
            raise ValueError("Could not determine num_atoms from coord.raw. It seems empty or malformed.")
        if coords_flat.shape[1] % 3 != 0:
            raise ValueError(
                f"coord.raw has {coords_flat.shape[1]} columns, which is not a multiple of 3 (N_atoms * 3).")
        num_atoms = coords_flat.shape[1] // 3

        # type.raw: either every atom type on the first line, or the types
        # spread over several lines (for example one per line).
        with open(raw_path('type.raw'), 'r') as f:
            type_rows = [line.split() for line in f if line.strip()]
        if not type_rows:
            raise ValueError(f"{raw_path('type.raw')} is empty or malformed.")
        all_type_tokens = [tok for row in type_rows for tok in row]
        if len(type_rows[0]) == num_atoms:
            # The atom types are assumed identical for every frame, so the first line is enough.
            type_tokens = type_rows[0]
        elif len(all_type_tokens) == num_atoms:
            type_tokens = all_type_tokens
        else:
            raise ValueError(
                f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. "
                f"First line of type.raw has {len(type_rows[0])} types, total lines {len(type_rows)}. "
                f"Please check type.raw format and adjust script.")
        atom_types_numeric = np.array([int(tok) for tok in type_tokens])
        # A negative index would silently wrap around to the end of the type map.
        if atom_types_numeric.min() < 0 or atom_types_numeric.max() >= len(type_map_list):
            raise ValueError(
                f"type.raw refers to type indices outside type_map.raw ({len(type_map_list)} entries).")
        atom_symbols = [type_map_list[t] for t in atom_types_numeric]

        boxes = np.loadtxt(raw_path('box.raw')).reshape(-1, 3, 3)
        energies = np.loadtxt(raw_path('energy.raw')).reshape(-1)
        forces_flat = np.loadtxt(raw_path('force.raw'), ndmin=2)
        virials_flat = np.loadtxt(raw_path('virial.raw'), ndmin=2)  # nine components per frame
    except Exception as e:
        raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") from e

    # --- Validate the array dimensions ---
    num_configs = energies.shape[0]
    expected_coord_cols = num_atoms * 3
    expected_virial_cols = 9  # DeePMD-kit normally writes nine virial components
    if coords_flat.shape[0] != num_configs:
        raise ValueError(
            f"coord.raw has {coords_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if boxes.shape[0] != num_configs:
        raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if forces_flat.shape[1] != expected_coord_cols:
        raise ValueError(
            f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.")
    if forces_flat.shape[0] != num_configs:
        raise ValueError(
            f"force.raw has {forces_flat.shape[0]} configurations, but expected {num_configs}. Data mismatch.")
    if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols:
        raise ValueError(
            f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.")
    coords = coords_flat.reshape(num_configs, num_atoms, 3)
    forces = forces_flat.reshape(num_configs, num_atoms, 3)
    virials_matrix = virials_flat.reshape(num_configs, 3, 3)
    print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.")

    # --- 2. Write the GPUMD NEP extended XYZ file ---
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_filepath = output_filename  # the given name is used as the final path
    weight_str = "Weight=1.0"
    properties_str = "Properties=species:S:1:pos:R:3:forces:R:3"
    with open(output_filepath, 'w') as f:
        frames = zip(boxes, energies, virials_matrix, coords, forces)
        for i, (box, energy, virial_tensor, frame_coords, frame_forces) in enumerate(frames):
            # Line 1: number of atoms.
            f.write(f"{num_atoms}\n")
            # Line 2: frame metadata.
            box_str = " ".join(f"{x:.10f}" for x in box.flatten())
            energy_str = f"{energy:.10f}"
            # All nine virial components, row-major from the 3x3 matrix.
            virial_str = " ".join(f"{x:.10f}" for x in virial_tensor.flatten())
            config_type_str = f"Config_type=dpgen_iter{i:03d}"  # example label built from the frame index
            f.write(
                f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n'
            )
            # Remaining lines: species symbol, position and force of each atom.
            f.writelines(
                f"{symbol} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n"
                for symbol, (x, y, z), (fx, fy, fz) in zip(atom_symbols, frame_coords, frame_forces)
            )
    print(f"Successfully converted {num_configs} configurations to {output_filepath}")
    print(f"Output file saved at: {output_filepath}")
# --- How to use this function ---
if __name__ == "__main__":
    # Example: convert the p3m1 raw data set into a single training file.
    raw_dir = 'data/dpmd_data/lyc/training_data/p3m1_data/raw'
    xyz_path = 'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz'
    convert_raw_to_gpumd_xyz(raw_dir, output_filename=xyz_path)

180
GPUMD/swap_li.py Normal file
View File

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
# -*- coding: ascii -*-
"""
Randomly swap one Li-Y pair in a VASP5 POSCAR and write N new files.
- Keeps coordinate mode (Direct/Cartesian), Selective Dynamics flags, and Velocities.
- Requires VASP5+ POSCAR (with element symbols line).
"""
import random
from pathlib import Path
def _is_ints(tokens):
try:
_ = [int(t) for t in tokens]
return True
except ValueError:
return False
def _find_species_index(species, target):
t = target.lower()
for i, s in enumerate(species):
if s.lower() == t:
return i
raise ValueError("Element '%s' not found in species line: %s" % (target, " ".join(species)))
def parse_poscar(lines):
    """
    Split the lines of a VASP5 POSCAR into its sections.

    Args:
        lines: The file content as a list of lines (with or without a trailing newline).

    Returns:
        dict with the keys:
            comment, scale, lv (three lattice-vector lines), species (list of
            symbols), counts (list of int), counts_line, selective (bool),
            sel_line, coord_line, natoms, pos_lines, vel_header, vel_lines,
            tail_lines and species_ranges (list of (symbol, start, end) with
            half-open indices into pos_lines).
        Everything after the positions that is not a block introduced by a
        line starting with "veloc" is kept verbatim in tail_lines, including
        blank separator lines.

    Raises:
        ValueError: If the input is too short, has no element-symbol line
            (VASP4), has a different number of symbols and counts, lacks the
            coordinate-mode line, or is shorter than the atom count requires.
    """
    if len(lines) < 8:
        raise ValueError("POSCAR too short")
    comment = lines[0].rstrip("\n")
    scale = lines[1].rstrip("\n")
    lv = [lines[j].rstrip("\n") for j in range(2, 5)]

    species = lines[5].split()
    if _is_ints(species):
        raise ValueError("VASP4 format (no element symbols line) is not supported.")
    counts_line = lines[6].rstrip("\n")
    counts = [int(x) for x in counts_line.split()]
    # The index ranges below pair symbols with counts, so both lines must agree.
    if len(counts) != len(species):
        raise ValueError(
            "Species line has %d symbols but counts line has %d entries."
            % (len(species), len(counts))
        )

    i = 7
    selective = False
    sel_line = None
    if lines[i].strip().lower().startswith("s"):
        selective = True
        sel_line = lines[i].rstrip("\n")
        i += 1
    if i >= len(lines):
        raise ValueError("Missing coordinate mode line (Direct/Cartesian).")
    coord_line = lines[i].rstrip("\n")
    i += 1

    natoms = sum(counts)
    pos_start = i
    pos_end = i + natoms
    if pos_end > len(lines):
        raise ValueError("Atom count exceeds file length.")
    pos_lines = [line.rstrip("\n") for line in lines[pos_start:pos_end]]

    # Optional Velocities section: skip blank lines to look for its header.
    k = pos_end
    while k < len(lines) and lines[k].strip() == "":
        k += 1
    vel_header = None
    vel_lines = None
    # Without a header the tail starts right after the positions, so blank
    # separator lines (e.g. the one before a header-less velocity block) survive.
    tail_start = pos_end
    if k < len(lines) and lines[k].strip().lower().startswith("veloc"):
        vel_header = lines[k].rstrip("\n")
        vel_start = k + 1
        vel_end = vel_start + natoms
        if vel_end > len(lines):
            raise ValueError("Velocities section length inconsistent with atom count.")
        vel_lines = [line.rstrip("\n") for line in lines[vel_start:vel_end]]
        tail_start = vel_end
    # NOTE(review): a velocity block without a "veloc..." header ends up in
    # tail_lines unchanged, so it is not swapped together with the positions.
    tail_lines = [line.rstrip("\n") for line in lines[tail_start:]]

    # Half-open index range of every species, in the order of the species line.
    species_ranges = []
    acc = 0
    for sp, c in zip(species, counts):
        species_ranges.append((sp, acc, acc + c))
        acc += c

    return {
        "comment": comment,
        "scale": scale,
        "lv": lv,
        "species": species,
        "counts": counts,
        "counts_line": counts_line,
        "selective": selective,
        "sel_line": sel_line,
        "coord_line": coord_line,
        "natoms": natoms,
        "pos_lines": pos_lines,
        "vel_header": vel_header,
        "vel_lines": vel_lines,
        "tail_lines": tail_lines,
        "species_ranges": species_ranges,
    }
def build_poscar(data, pos_lines, vel_lines=None):
    """
    Assemble POSCAR text from parsed header data and the given atom lines.

    Args:
        data: Dict with the keys comment, scale, lv, species, counts_line,
            selective, sel_line, coord_line, vel_header and tail_lines.
        pos_lines: Position lines to write.
        vel_lines: Velocity lines; written only when data["vel_header"] is
            not None as well.

    Returns:
        The file content as one string ending with a newline.
    """
    rows = [
        data["comment"],
        data["scale"],
        *data["lv"],
        " ".join(data["species"]),
        data["counts_line"],
    ]
    if data["selective"]:
        sel = data["sel_line"]
        rows.append("Selective dynamics" if sel is None else sel)
    rows.append(data["coord_line"])
    rows += list(pos_lines)
    if data["vel_header"] is not None and vel_lines is not None:
        rows += [data["vel_header"], *vel_lines]
    rows += data["tail_lines"] or []
    return "\n".join(rows) + "\n"
def _swap_once(data, rng, li_label="Li", y_label="Y"):
    """
    Exchange the lines of one randomly chosen li_label atom and one y_label atom.

    Args:
        data: Parsed POSCAR dict; species, species_ranges, pos_lines and
            vel_lines are read and left unmodified.
        rng: Object providing randrange(start, stop).
        li_label, y_label: Species symbols, matched case-insensitively.

    Returns:
        (new_pos, new_vel, (li_pick, y_pick)); new_vel is None when the
        parsed data holds no velocity lines.
    """
    li_slot = _find_species_index(data["species"], li_label)
    y_slot = _find_species_index(data["species"], y_label)
    li_first, li_stop = data["species_ranges"][li_slot][1:]
    y_first, y_stop = data["species_ranges"][y_slot][1:]
    # Draw the Li index first, then the Y index (fixed order of RNG calls).
    li_pick = rng.randrange(li_first, li_stop)
    y_pick = rng.randrange(y_first, y_stop)

    def _exchanged(rows):
        """Return a copy of rows with the two picked entries swapped."""
        copy = list(rows)
        copy[li_pick], copy[y_pick] = copy[y_pick], copy[li_pick]
        return copy

    new_pos = _exchanged(data["pos_lines"])
    new_vel = None if data["vel_lines"] is None else _exchanged(data["vel_lines"])
    return new_pos, new_vel, (li_pick, y_pick)
def swap(n, input_file, output_dir, seed=None):
    """
    Generate n POSCAR files, each with one random Li-Y swap.

    Every output starts from the unmodified input structure (swaps are not
    cumulative), so two outputs may contain the same swap.

    Args:
        n: Number of files to write; nothing is written when n < 1.
        input_file: Path of the input VASP5 POSCAR.
        output_dir: Directory for the outputs, created if missing. Files are
            named swap_<k>_<input file name> with k = 1..n.
        seed: Optional seed for the random generator. The default None keeps
            the previous behaviour (a different result on every run); pass a
            value to make the chosen swaps reproducible.

    Returns: list of Path to written files.
    """
    input_path = Path(input_file)
    out_dir = Path(output_dir)
    # Parse first so an unreadable or invalid input leaves no empty output directory behind.
    data = parse_poscar(input_path.read_text().splitlines())
    out_dir.mkdir(parents=True, exist_ok=True)
    rng = random.Random(seed)
    base = input_path.name
    out_paths = []
    for k in range(1, n + 1):
        new_pos, new_vel, picked = _swap_once(data, rng)
        out_path = out_dir / f"swap_{k}_{base}"
        out_path.write_text(build_poscar(data, new_pos, new_vel))
        out_paths.append(out_path)
        print(f"Wrote {out_path} (swapped Li idx {picked[0]} <-> Y idx {picked[1]})")
    return out_paths
# --------- Editable defaults for direct run ---------
# Path to the input POSCAR.
INPUT_FILE = "data_POSCAR/origin/p3m1.vasp"
# Directory that receives the generated files.
OUTPUT_DIR = "data_POSCAR/p3m1"
# Number of files to generate.
N = 5
# ----------------------------------------------------
if __name__ == "__main__":
    # Direct-run entry: edit INPUT_FILE/OUTPUT_DIR/N above to change behavior.
    swap(N, INPUT_FILE, OUTPUT_DIR)

140
GPUMD/t-SNE/t-SNE.py Normal file
View File

@@ -0,0 +1,140 @@
from pathlib import Path
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
def tsne_dir_shared_coords(
dir_path: str,
*,
metric: str = "euclidean", # 可试 "cosine";想保留尺度差异用 "euclidean"
perplexity: float = 50.0, # 30k~50k 样本建议 30~50
n_iter: int = 1000,
early_exaggeration: float = 12.0,
learning_rate = "auto",
standardize: bool = False,
pca_dim: int | None = None, # 先用 PCA 降到 pca_dim(如 20) 再跑 t-SNE可提速
context: bool = True,
make_joint: bool = True,
init: str = "pca",
random_state: int = 42
) -> None:
p = Path(dir_path)
if not p.is_dir():
raise ValueError(f"{dir_path!r} 不是有效文件夹")
files = sorted(p.glob("*.npy"))
if not files:
print(f"目录 {p} 中未找到 .npy 文件")
return
X_list, paths, counts = [], [], []
for f in files:
try:
data = np.load(f)
if data.ndim != 2:
print(f"[跳过] {f.name}: 期望二维数组,实际 shape={data.shape}")
continue
# 统一到 (n_samples, 30)
if data.shape[1] == 30:
X = data
elif data.shape[0] == 30:
X = data.T
else:
print(f"[跳过] {f.name}: shape={data.shape}, 未检测到 30 维特征")
continue
mask = np.isfinite(X).all(axis=1)
if not np.all(mask):
X = X[mask]
print(f"[提示] {f.name}: 移除了含 NaN/Inf 的样本行")
if X.shape[0] < 3:
print(f"[跳过] {f.name}: 样本数过少(n={X.shape[0]})")
continue
X_list.append(X)
paths.append(f)
counts.append(X.shape[0])
except Exception as e:
print(f"[错误] 读取 {f.name} 失败: {e}")
if not X_list:
print("未找到可用的数据文件")
return
X_all = np.vstack(X_list)
if standardize:
mean = X_all.mean(axis=0)
std = X_all.std(axis=0); std[std == 0] = 1.0
X_all = (X_all - mean) / std
if pca_dim is not None and pca_dim > 2:
X_all = PCA(n_components=pca_dim, random_state=random_state).fit_transform(X_all)
tsne = TSNE(
n_components=2,
metric=metric,
perplexity=float(perplexity),
early_exaggeration=float(early_exaggeration),
learning_rate=learning_rate,
init=init,
random_state=random_state,
method="barnes_hut", # 适合大样本
angle=0.5,
verbose=0,
)
Z_all = tsne.fit_transform(X_all)
# 统一坐标轴范围
x_min, x_max = float(Z_all[:, 0].min()), float(Z_all[:, 0].max())
y_min, y_max = float(Z_all[:, 1].min()), float(Z_all[:, 1].max())
pad_x = 0.05 * (x_max - x_min) if x_max > x_min else 1.0
pad_y = 0.05 * (y_max - y_min) if y_max > y_min else 1.0
colors = [
"#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd",
"#8c564b","#e377c2","#7f7f7f","#bcbd22","#17becf"
]
# 分文件出图
start = 0
for i, (f, n) in enumerate(zip(paths, counts)):
Zi = Z_all[start:start + n]; start += n
fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
if context:
ax.scatter(Z_all[:, 0], Z_all[:, 1], s=5, c="#cccccc", alpha=0.35, edgecolors="none", label="All")
ax.scatter(Zi[:, 0], Zi[:, 1], s=8, c=colors[i % len(colors)], alpha=0.9, edgecolors="none", label=f.name)
ax.set_title(f"{f.name} • t-SNE(shared) (perp={perplexity}, metric={metric})", fontsize=9)
ax.set_xlabel("t-SNE-1"); ax.set_ylabel("t-SNE-2")
ax.set_xlim(x_min - pad_x, x_max + pad_x); ax.set_ylim(y_min - pad_y, y_max + pad_y)
ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
if context: ax.legend(loc="best", fontsize=8, frameon=False)
fig.tight_layout()
out_png = f.with_suffix("").as_posix() + "_tsne_shared.png"
fig.savefig(out_png); plt.close(fig)
print(f"[完成] {f.name} -> {out_png}")
# 总览图
if make_joint:
start = 0
fig, ax = plt.subplots(figsize=(7, 6), dpi=150)
for i, (f, n) in enumerate(zip(paths, counts)):
Zi = Z_all[start:start + n]; start += n
ax.scatter(Zi[:, 0], Zi[:, 1], s=8, c=colors[i % len(colors)], alpha=0.85, edgecolors="none", label=f.name)
ax.set_title(f"t-SNE(shared) overview (perp={perplexity}, metric={metric})", fontsize=10)
ax.set_xlabel("t-SNE-1"); ax.set_ylabel("t-SNE-2")
ax.set_xlim(x_min - pad_x, x_max + pad_x); ax.set_ylim(y_min - pad_y, y_max + pad_y)
ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
ax.legend(loc="best", fontsize=8, frameon=False)
fig.tight_layout()
out_png = Path(dir_path) / "tsne_shared_overview.png"
fig.savefig(out_png.as_posix()); plt.close(fig)
print(f"[完成] 总览 -> {out_png}")
# Script entry point: embed every .npy file in the "data" directory
# (resolved against the current working directory) with the default settings.
if __name__ == "__main__":
    tsne_dir_shared_coords(dir_path="data")