diff --git a/GPUMD/Umap/umap_make.py b/GPUMD/Umap/umap_make.py new file mode 100644 index 0000000..6a2f127 --- /dev/null +++ b/GPUMD/Umap/umap_make.py @@ -0,0 +1,76 @@ +from pathlib import Path +import numpy as np +import matplotlib +matplotlib.use("Agg") # 仅保存图片,不弹窗 +import matplotlib.pyplot as plt +from umap import UMAP + +def umap_dir_to_pngs(dir_path: str) -> None: + """ + 对目录内每个 .npy 文件执行 UMAP(30D->2D) 并保存散点图。 + - 输入 .npy 期望形状为 (n_samples, 30) 或 (30, n_samples) + - 输出图片保存在同目录,命名为 <原文件名>_umap.png + """ + p = Path(dir_path) + if not p.is_dir(): + raise ValueError(f"{dir_path!r} 不是有效文件夹") + + files = sorted(p.glob("*.npy")) + if not files: + print(f"目录 {p} 中未找到 .npy 文件") + return + + for f in files: + try: + data = np.load(f) + if data.ndim == 2: + if data.shape[1] == 30: + X = data + elif data.shape[0] == 30: + X = data.T + else: + print(f"[跳过] {f.name}: shape={data.shape}, 未检测到 30 维特征") + continue + else: + print(f"[跳过] {f.name}: 期望二维数组,实际 shape={data.shape}") + continue + + # 清理非数值行 + mask = np.isfinite(X).all(axis=1) + if not np.all(mask): + X = X[mask] + print(f"[提示] {f.name}: 移除了含 NaN/Inf 的样本行") + + n_samples = X.shape[0] + if n_samples < 3: + print(f"[跳过] {f.name}: 样本数过少(n={n_samples}),无法稳定降维") + continue + + # 确保 n_neighbors 合法 + n_neighbors = min(15, max(2, n_samples - 1)) + reducer = UMAP( + n_components=2, + n_neighbors=n_neighbors, + min_dist=0.1, + metric="euclidean", + random_state=42, + ) + emb = reducer.fit_transform(X) + + fig, ax = plt.subplots(figsize=(6, 5), dpi=150) + ax.scatter(emb[:, 0], emb[:, 1], s=6, c="#1f77b4", alpha=0.8, edgecolors="none") + ax.set_title(f"{f.name} • UMAP (n={len(X)}, nn={n_neighbors})", fontsize=10) + ax.set_xlabel("UMAP-1") + ax.set_ylabel("UMAP-2") + ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5) + fig.tight_layout() + + out_png = f.with_suffix("").as_posix() + "_umap.png" + fig.savefig(out_png) + plt.close(fig) + print(f"[完成] {f.name} -> {out_png}") + except Exception as e: + print(f"[错误] 处理 {f.name} 失败: {e}") + +if __name__=="__main__": + umap_dir_to_pngs("data") \ No newline at end of file diff --git a/GPUMD/Umap/umap_make_2.py b/GPUMD/Umap/umap_make_2.py new file mode 100644 index 0000000..d86eaaa --- /dev/null +++ b/GPUMD/Umap/umap_make_2.py @@ -0,0 +1,161 @@ +from pathlib import Path +import numpy as np +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +try: + from umap import UMAP +except Exception: + from umap.umap_ import UMAP + + +def umap_dir_shared_coords( + dir_path: str, + *, + metric: str = "cosine", + n_neighbors: int = 15, + min_dist: float = 0.0, + spread: float = 1.2, + standardize: bool = False, + context: bool = True, + make_joint: bool = True, + init: str = "random", # 关键:禁用谱初始化,避免告警;也可用 "pca" + jitter: float = 0.0, # 可选:拟合前加微弱噪声,如 1e-6 + random_state: int = 42 +) -> None: + """ + 在同一 UMAP 坐标系中为目录内每个 .npy 文件生成 2D 图。 + - 每个 .npy 形状为 (n_samples, 30) 或 (30, n_samples) + - 统一坐标轴范围;各自输出 *_umap_shared.png,另可输出总览图 + """ + p = Path(dir_path) + if not p.is_dir(): + raise ValueError(f"{dir_path!r} 不是有效文件夹") + + files = sorted(p.glob("*.npy")) + if not files: + print(f"目录 {p} 中未找到 .npy 文件") + return + + X_list, paths, counts = [], [], [] + for f in files: + try: + data = np.load(f) + if data.ndim != 2: + print(f"[跳过] {f.name}: 期望二维数组,实际 shape={data.shape}") + continue + + if data.shape[1] == 30: + X = data + elif data.shape[0] == 30: + X = data.T + else: + print(f"[跳过] {f.name}: shape={data.shape}, 未检测到 30 维特征") + continue + + mask = np.isfinite(X).all(axis=1) + if not np.all(mask): + X = 
X[mask] + print(f"[提示] {f.name}: 移除了含 NaN/Inf 的样本行") + + if X.shape[0] < 3: + print(f"[跳过] {f.name}: 样本数过少(n={X.shape[0]})") + continue + + X_list.append(X) + paths.append(f) + counts.append(X.shape[0]) + except Exception as e: + print(f"[错误] 读取 {f.name} 失败: {e}") + + if not X_list: + print("未找到可用的数据文件") + return + + X_all = np.vstack(X_list) + + if standardize: + mean = X_all.mean(axis=0) + std = X_all.std(axis=0) + std[std == 0] = 1.0 + X_all = (X_all - mean) / std + + if jitter and jitter > 0: + rng = np.random.default_rng(random_state) + X_all = X_all + rng.normal(scale=jitter, size=X_all.shape) + + reducer = UMAP( + n_components=2, + n_neighbors=int(max(2, n_neighbors)), + min_dist=float(min_dist), + spread=float(spread), + metric=metric, + init=init, # 关键改动:避免谱初始化告警 + random_state=random_state, + ) + Z_all = reducer.fit_transform(X_all) + + x_min, x_max = float(Z_all[:, 0].min()), float(Z_all[:, 0].max()) + y_min, y_max = float(Z_all[:, 1].min()), float(Z_all[:, 1].max()) + pad_x = 0.05 * (x_max - x_min) if x_max > x_min else 1.0 + pad_y = 0.05 * (y_max - y_min) if y_max > y_min else 1.0 + + base_colors = [ + "#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd", + "#8c564b","#e377c2","#7f7f7f","#bcbd22","#17becf" + ] + + start = 0 + for i, (f, n) in enumerate(zip(paths, counts)): + Zi = Z_all[start:start + n] + start += n + + fig, ax = plt.subplots(figsize=(6, 5), dpi=150) + if context: + ax.scatter(Z_all[:, 0], Z_all[:, 1], s=5, c="#cccccc", + alpha=0.35, edgecolors="none", label="All") + ax.scatter(Zi[:, 0], Zi[:, 1], s=10, + c=base_colors[i % len(base_colors)], + alpha=0.9, edgecolors="none", label=f.name) + + ax.set_title( + f"{f.name} • UMAP(shared) (nn={n_neighbors}, min={min_dist}, metric={metric}, init={init})", + fontsize=9 + ) + ax.set_xlabel("UMAP-1") + ax.set_ylabel("UMAP-2") + ax.set_xlim(x_min - pad_x, x_max + pad_x) + ax.set_ylim(y_min - pad_y, y_max + pad_y) + ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5) + if context: + ax.legend(loc="best", fontsize=8, frameon=False) + fig.tight_layout() + + out_png = f.with_suffix("").as_posix() + "_umap_shared.png" + fig.savefig(out_png) + plt.close(fig) + print(f"[完成] {f.name} -> {out_png}") + + if make_joint: + start = 0 + fig, ax = plt.subplots(figsize=(7, 6), dpi=150) + for i, (f, n) in enumerate(zip(paths, counts)): + Zi = Z_all[start:start + n]; start += n + ax.scatter(Zi[:, 0], Zi[:, 1], s=8, + c=base_colors[i % len(base_colors)], + alpha=0.85, edgecolors="none", label=f.name) + ax.set_title(f"UMAP(shared) overview (metric={metric}, nn={n_neighbors}, min={min_dist}, init={init})", + fontsize=10) + ax.set_xlabel("UMAP-1"); ax.set_ylabel("UMAP-2") + ax.set_xlim(x_min - pad_x, x_max + pad_x) + ax.set_ylim(y_min - pad_y, y_max + pad_y) + ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5) + ax.legend(loc="best", fontsize=8, frameon=False, ncol=1) + fig.tight_layout() + out_png = Path(dir_path) / "umap_shared_overview.png" + fig.savefig(out_png.as_posix()) + plt.close(fig) + print(f"[完成] 总览 -> {out_png}") +if __name__=="__main__": + umap_dir_shared_coords("data") \ No newline at end of file diff --git a/GPUMD/data_POSCAR/origin/pnma.vasp b/GPUMD/data_POSCAR/origin/pnma.vasp new file mode 100644 index 0000000..7c5ad26 --- /dev/null +++ b/GPUMD/data_POSCAR/origin/pnma.vasp @@ -0,0 +1,88 @@ +Li Y Cl + 1.0000000000000000 + 12.1082364219999992 -0.0000000000000000 0.0000000000000000 + 0.0000420925000000 12.6964871139000000 0.0000000000000000 + 0.0000111360000000 0.0000097283000000 11.1520040839999997 + Li Y Cl + 24 8 48 
+Cartesian + 3.0170113299999999 11.0208475999999997 6.5429541999999996 + 9.0710813300000002 11.0208076100000003 6.5429413099999998 + 3.0372732299999998 1.6755553700000001 0.9669378900000000 + 9.0914532300000008 1.6755853700000001 0.9668849900000001 + 5.9960454600000004 8.0228419300000002 4.6273539599999998 + 12.0502254600000001 8.0228319300000006 4.6273410699999999 + 6.0439837100000000 8.0239104000000001 0.9669839400000000 + 12.0980237099999997 8.0238703999999998 0.9669410500000000 + 2.9687930800000002 11.0219791300000001 10.2032142199999996 + 9.0228930799999993 11.0219691300000004 10.2032413300000009 + 3.0851749800000001 1.6745967300000000 4.6273578600000000 + 9.1393549800000002 1.6745967399999999 4.6273049700000000 + 0.0581325900000000 4.6736939399999997 10.2033481199999994 + 6.1122525899999998 4.6736539400000003 10.2033652299999993 + 0.0102008400000000 4.6725925699999999 6.5430081500000004 + 6.0643308400000002 4.6725225699999999 6.5430152499999998 + 6.0582017800000001 11.3105425600000000 6.4882826299999996 + 0.0040517800000000 11.3105725600000007 6.4882655299999996 + 3.0311532099999998 7.7341853599999997 0.9123054900000001 + 9.0851632099999993 7.7341953600000002 0.9123126000000000 + 3.0230812999999999 4.9623175699999997 6.4882665900000003 + 9.0772212999999997 4.9622875700000000 6.4882637000000001 + 12.1043127199999994 1.3859403699999999 0.9123036700000000 + 6.0501727199999999 1.3859103699999999 0.9123265600000000 + 0.0501691200000000 4.7065010400000000 2.8276881199999999 + 6.1042791200000002 4.7064910400000004 2.8277152200000000 + 6.0040072100000001 7.9899634099999997 8.4036839699999994 + 12.0581272100000003 7.9900134100000004 8.4037010799999994 + 3.0772167300000000 1.6417582399999999 8.4037178800000003 + 9.1312967300000007 1.6417482400000001 8.4036949900000000 + 2.9769896100000000 11.0547561999999999 2.8276842100000001 + 9.0310896100000004 11.0547362000000007 2.8277013100000001 + 4.5156189400000004 10.0925444199999994 4.7485038499999996 + 10.5696489400000004 10.0924044199999994 4.7485109599999999 + 1.5387092400000000 2.6040715400000001 10.3245482299999995 + 7.5927392300000003 2.6040115400000001 10.3245253399999992 + 1.3626281400000000 9.1096203100000004 6.4718857500000002 + 7.4167081399999999 9.1095703100000005 6.4718328500000002 + 1.4883697199999999 8.9523616199999996 10.3245457500000004 + 7.5425797200000000 8.9522516299999992 10.3245128600000005 + 4.3896168400000004 9.9351631099999995 0.8958039700000000 + 10.4437368399999997 9.9351431100000003 0.8958210800000000 + 1.6644776500000000 2.7613398100000000 6.4718681100000000 + 7.7185976500000004 2.7613198099999998 6.4718352200000000 + 4.6916063499999998 3.5868826100000000 0.8958463400000000 + 10.7457363499999996 3.5868826100000000 0.8958134500000000 + 4.5657084499999998 3.7442043400000000 4.7485363400000002 + 10.6197684500000005 3.7442543399999999 4.7484934399999998 + 1.4271435299999999 5.7840809399999999 8.3327970300000000 + 7.4812435300000004 5.7840109399999999 8.3328141400000000 + 4.6270127299999997 6.9124334500000000 2.7567950600000000 + 10.6811427299999995 6.9124334500000000 2.7567721600000001 + 4.4542422500000001 0.5641837300000000 2.7567976500000002 + 10.5083422500000001 0.5641637400000000 2.7567847500000000 + 1.5999540200000000 12.1323306500000001 8.3327844399999993 + 7.6540440199999997 12.1323306500000001 8.3327815399999992 + 1.4488319300000001 5.8551694799999998 4.7388569900000004 + 7.5029319299999999 5.8551594800000002 4.7388941000000004 + 4.6052662299999998 6.8413464700000004 10.3148350900000008 + 10.6594162299999997 
6.8413164799999997 10.3148222000000001 + 4.4984539600000000 0.5241951300000000 6.4733476400000001 + 10.5525539599999991 0.5241751300000000 6.4733547500000004 + 4.4759357399999997 0.4930566900000000 10.3148076700000004 + 10.5300357400000006 0.4930866900000000 10.3148047700000003 + 1.4714200500000001 5.8240679200000001 0.8973369900000000 + 7.5254900500000002 5.8240779299999996 0.8973441000000000 + 4.5827044399999997 6.8724549899999996 6.4733550900000001 + 10.6368744399999997 6.8724349900000004 6.4733521999999999 + 1.5557505300000001 12.1723877900000002 0.8973444400000000 + 7.6098205300000004 12.1722777900000008 0.8973515500000000 + 1.5782524200000001 12.2033792699999992 4.7388244200000003 + 7.6324124199999996 12.2033992700000002 4.7388615300000003 + 1.5507855200000000 2.5414585299999999 2.7575782499999999 + 7.6049855199999996 2.5414785300000000 2.7574953600000001 + 4.5034907500000001 10.1550858599999998 8.3335738300000006 + 10.5574507499999992 10.1550558599999992 8.3335109400000000 + 4.5777702700000003 3.8068257399999998 8.3335863099999994 + 10.6319002699999992 3.8067957400000001 8.3335634100000000 + 1.4764060000000001 8.8896886500000001 2.7575657800000002 + 7.5304859999999998 8.8896686500000008 2.7575728900000001 diff --git a/GPUMD/raw2xyz.py b/GPUMD/raw2xyz.py new file mode 100644 index 0000000..fccd548 --- /dev/null +++ b/GPUMD/raw2xyz.py @@ -0,0 +1,149 @@ +import os +import numpy as np + + +def convert_raw_to_gpumd_xyz(input_folder: str, output_filename: str = "gpumd_nep_training_data.xyz"): + """ + 将 DeePMD-kit 风格的 .raw 训练数据转换为 GPUMD NEP 训练所需的 extended XYZ 格式。 + 调整为 GPUMD 期望的格式,包括在注释行中添加 Properties 字段, + 并将每个原子的力数据附加到原子坐标行。 + Args: + input_folder (str): 包含 .raw 文件的文件夹路径 (例如 './set.000/'). + output_filename (str): 输出的 GPUMD extended XYZ 文件的名称。 + Raises: + FileNotFoundError: 如果必需的 .raw 文件不存在。 + ValueError: 如果数据格式不符合预期。 + """ + required_files = [ + 'box.raw', 'coord.raw', 'energy.raw', 'force.raw', + 'type.raw', 'type_map.raw', 'virial.raw' + ] + # 检查所有必需的文件是否存在 + for filename in required_files: + filepath = os.path.join(input_folder, filename) + if not os.path.exists(filepath): + raise FileNotFoundError( + f"Missing required file: {filepath}. Please ensure all .raw files are in the specified folder.") + print(f"Loading raw from folder: {input_folder}") + + # --- 1. 读取数据 --- + try: + # 读取 type_map.raw + with open(os.path.join(input_folder, 'type_map.raw'), 'r') as f: + type_map_list = [line.strip() for line in f if line.strip()] # 移除空行 + + # 首次加载 coord.raw 来确定 num_atoms + first_coord_line = np.loadtxt(os.path.join(input_folder, 'coord.raw'), max_rows=1) + if first_coord_line.ndim == 0: # 如果只有1个数字 + num_atoms = 1 + else: + num_atoms = first_coord_line.shape[0] // 3 + if num_atoms == 0: + raise ValueError(f"Could not determine num_atoms from coord.raw. 
It seems empty or malformed.") + + # 现在有了正确的 num_atoms,重新加载 type.raw 以获取原子类型列表 + with open(os.path.join(input_folder, 'type.raw'), 'r') as f: + all_types_lines = f.readlines() + if not all_types_lines: + raise ValueError(f"{os.path.join(input_folder, 'type.raw')} is empty or malformed.") + + # 假设所有构型的原子类型序列是相同的,我们只需要第一个构型的类型 + first_type_config = np.array([int(x) for x in all_types_lines[0].strip().split()]) + if len(first_type_config) != num_atoms: + # 尝试另一种 DeePMD 常见的 type.raw 格式:一个长序列,表示所有原子类型 + # 如果 type.raw 的行数等于原子数,我们假设每行一个原子类型 + if len(all_types_lines) == num_atoms: + atom_types_numeric = np.array([int(line.strip()) for line in all_types_lines]) + else: + raise ValueError( + f"Mismatch between num_atoms ({num_atoms}) derived from coord.raw and type.raw format. " + f"First line of type.raw has {len(first_type_config)} types, total lines {len(all_types_lines)}. " + f"Please check type.raw format and adjust script.") + else: + atom_types_numeric = first_type_config # 正常情况,第一行就是第一个构型的所有原子类型 + + atom_symbols = [type_map_list[t] for t in atom_types_numeric] + + # 读取其他数据 + boxes = np.loadtxt(os.path.join(input_folder, 'box.raw')).reshape(-1, 3, 3) + coords_flat = np.loadtxt(os.path.join(input_folder, 'coord.raw')) + energies = np.loadtxt(os.path.join(input_folder, 'energy.raw')) + forces_flat = np.loadtxt(os.path.join(input_folder, 'force.raw')) + virials_flat = np.loadtxt(os.path.join(input_folder, 'virial.raw')) # 可能是 9 个分量 + + except Exception as e: + raise ValueError(f"Error reading .raw files. Please check their format. Details: {e}") + + # 验证数据维度 + num_configs = len(energies) + expected_coord_cols = num_atoms * 3 + expected_virial_cols = 9 # DeepMD通常输出9个分量 + + if coords_flat.shape[1] != expected_coord_cols: + raise ValueError( + f"coord.raw has {coords_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3).") + if boxes.shape[0] != num_configs: + raise ValueError(f"box.raw has {boxes.shape[0]} configurations, but expected {num_configs}. Data mismatch.") + if forces_flat.shape[1] != expected_coord_cols: + raise ValueError( + f"force.raw has {forces_flat.shape[1]} columns, but expected {expected_coord_cols} (N_atoms * 3). Check file format.") + if virials_flat.shape[0] != num_configs or virials_flat.shape[1] != expected_virial_cols: + raise ValueError( + f"virial.raw has shape {virials_flat.shape}, but expected ({num_configs}, {expected_virial_cols}). Check file format.") + + coords = coords_flat.reshape(num_configs, num_atoms, 3) + forces = forces_flat.reshape(num_configs, num_atoms, 3) + virials_matrix = virials_flat.reshape(num_configs, 3, 3) + + print(f"Loaded {num_configs} configurations with {num_atoms} atoms each.") + + # --- 2. 
写入到 GPUMD NEP 的 extended XYZ 格式 --- + # 确保输出路径的目录存在 + output_dir = os.path.dirname(output_filename) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir) + + output_filepath = output_filename # 直接使用传入的output_filename作为最终路径 + + with open(output_filepath, 'w') as f: + for i in range(num_configs): + # 第一行:原子数量 + f.write(f"{num_atoms}\n") + + # 第二行:元数据 + box_matrix_flat = boxes[i].flatten() + box_str = " ".join(f"{x:.10f}" for x in box_matrix_flat) + energy_str = f"{energies[i]:.10f}" + + virial_tensor = virials_matrix[i] + # --- 关键修改处:输出 Virial 的九个分量 --- + # 展平 3x3 矩阵以得到九个分量 + virial_gpumd_components = virial_tensor.flatten() + virial_str = " ".join(f"{x:.10f}" for x in virial_gpumd_components) + + # 构造 GPUMD 兼容的第二行 + config_type_str = f"Config_type=dpgen_iter{i:03d}" # 示例:迭代号,可以自定义 + weight_str = "Weight=1.0" + properties_str = "Properties=species:S:1:pos:R:3:forces:R:3" # 关键修改 + + f.write( + f'{config_type_str} {weight_str} Lattice="{box_str}" Energy={energy_str} Virial="{virial_str}" pbc="T T T" {properties_str}\n' + ) + + # 后续行:原子符号、坐标和力 + for j in range(num_atoms): + x, y, z = coords[i, j] + fx, fy, fz = forces[i, j] + f.write(f"{atom_symbols[j]} {x:.10f} {y:.10f} {z:.10f} {fx:.10f} {fy:.10f} {fz:.10f}\n") + + print(f"Successfully converted {num_configs} configurations to {output_filepath}") + print(f"Output file saved at: {output_filepath}") + + +# --- 如何使用这个函数 --- +if __name__ == "__main__": + # 示例用法: + input_folder_path = 'data/dpmd_data/lyc/training_data/p3m1_data/raw' + output_file_path = 'data/dpmd_data/lyc/training_data/p3m1_data/p3m1_train.xyz' + + convert_raw_to_gpumd_xyz(input_folder=input_folder_path, output_filename=output_file_path) diff --git a/GPUMD/swap_li.py b/GPUMD/swap_li.py new file mode 100644 index 0000000..eaf4b69 --- /dev/null +++ b/GPUMD/swap_li.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +# -*- coding: ascii -*- +""" +Randomly swap one Li-Y pair in a VASP5 POSCAR and write N new files. +- Keeps coordinate mode (Direct/Cartesian), Selective Dynamics flags, and Velocities. +- Requires VASP5+ POSCAR (with element symbols line). 
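+- Output files are named swap_<k>_<input filename> in the chosen output directory.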
+""" + +import random +from pathlib import Path + + + +def _is_ints(tokens): + try: + _ = [int(t) for t in tokens] + return True + except ValueError: + return False + +def _find_species_index(species, target): + t = target.lower() + for i, s in enumerate(species): + if s.lower() == t: + return i + raise ValueError("Element '%s' not found in species line: %s" % (target, " ".join(species))) + +def parse_poscar(lines): + if len(lines) < 8: + raise ValueError("POSCAR too short") + + comment = lines[0].rstrip("\n") + scale = lines[1].rstrip("\n") + lv = [lines[2].rstrip("\n"), lines[3].rstrip("\n"), lines[4].rstrip("\n")] + + i = 5 + tokens = lines[i].split() + if _is_ints(tokens): + raise ValueError("VASP4 format (no element symbols line) is not supported.") + species = tokens + i += 1 + + counts_line = lines[i].rstrip("\n") + counts = [int(x) for x in counts_line.split()] + i += 1 + + selective = False + sel_line = None + if i < len(lines) and lines[i].strip().lower().startswith("s"): + selective = True + sel_line = lines[i].rstrip("\n") + i += 1 + + coord_line = lines[i].rstrip("\n") + i += 1 + + natoms = sum(counts) + pos_start = i + pos_end = i + natoms + if pos_end > len(lines): + raise ValueError("Atom count exceeds file length.") + pos_lines = [lines[j].rstrip("\n") for j in range(pos_start, pos_end)] + + # Optional Velocities section + k = pos_end + while k < len(lines) and lines[k].strip() == "": + k += 1 + + vel_header = None + vel_lines = None + vel_end = k + if k < len(lines) and lines[k].strip().lower().startswith("veloc"): + vel_header = lines[k].rstrip("\n") + vel_start = k + 1 + vel_end = vel_start + natoms + if vel_end > len(lines): + raise ValueError("Velocities section length inconsistent with atom count.") + vel_lines = [lines[j].rstrip("\n") for j in range(vel_start, vel_end)] + + tail_lines = [lines[j].rstrip("\n") for j in range(vel_end, len(lines))] if vel_end < len(lines) else [] + + # Species index ranges (by order in species list) + starts = [] + acc = 0 + for c in counts: + starts.append(acc) + acc += c + species_ranges = [] + for idx, sp in enumerate(species): + s, e = starts[idx], starts[idx] + counts[idx] + species_ranges.append((sp, s, e)) + + return { + "comment": comment, + "scale": scale, + "lv": lv, + "species": species, + "counts": counts, + "counts_line": counts_line, + "selective": selective, + "sel_line": sel_line, + "coord_line": coord_line, + "natoms": natoms, + "pos_lines": pos_lines, + "vel_header": vel_header, + "vel_lines": vel_lines, + "tail_lines": tail_lines, + "species_ranges": species_ranges, + } + +def build_poscar(data, pos_lines, vel_lines=None): + out = [] + out.append(data["comment"]) + out.append(data["scale"]) + out.extend(data["lv"]) + out.append(" ".join(data["species"])) + out.append(data["counts_line"]) + if data["selective"]: + out.append(data["sel_line"] if data["sel_line"] is not None else "Selective dynamics") + out.append(data["coord_line"]) + out.extend(pos_lines) + if data["vel_header"] is not None and vel_lines is not None: + out.append(data["vel_header"]) + out.extend(vel_lines) + if data["tail_lines"]: + out.extend(data["tail_lines"]) + return "\n".join(out) + "\n" + +def _swap_once(data, rng, li_label="Li", y_label="Y"): + si_li = _find_species_index(data["species"], li_label) + si_y = _find_species_index(data["species"], y_label) + _, li_start, li_end = data["species_ranges"][si_li] + _, y_start, y_end = data["species_ranges"][si_y] + + li_pick = rng.randrange(li_start, li_end) + y_pick = rng.randrange(y_start, y_end) + 
+ new_pos = list(data["pos_lines"]) + new_pos[li_pick], new_pos[y_pick] = new_pos[y_pick], new_pos[li_pick] + + new_vel = None + if data["vel_lines"] is not None: + new_vel = list(data["vel_lines"]) + new_vel[li_pick], new_vel[y_pick] = new_vel[y_pick], new_vel[li_pick] + + return new_pos, new_vel, (li_pick, y_pick) + +def swap(n, input_file, output_dir): + """ + Generate n POSCAR files, each with one random Li-Y swap. + + Returns: list of Path to written files. + """ + input_path = Path(input_file) + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + lines = input_path.read_text().splitlines() + data = parse_poscar(lines) + + rng = random.Random() + base = input_path.name + + out_paths = [] + for k in range(1, n + 1): + new_pos, new_vel, picked = _swap_once(data, rng) + txt = build_poscar(data, new_pos, new_vel) + out_path = out_dir / f"swap_{k}_{base}" + out_path.write_text(txt) + out_paths.append(out_path) + print(f"Wrote {out_path} (swapped Li idx {picked[0]} <-> Y idx {picked[1]})") + return out_paths +# --------- Editable defaults for direct run --------- +INPUT_FILE = "data_POSCAR/origin/p3m1.vasp" # path to input POSCAR +OUTPUT_DIR = "data_POSCAR/p3m1" # output directory +N = 5 # number of files to generate +# ---------------------------------------------------- +if __name__ == "__main__": + # Direct-run entry: edit INPUT_FILE/OUTPUT_DIR/N above to change behavior. + swap(n=N, input_file=INPUT_FILE, output_dir=OUTPUT_DIR) \ No newline at end of file diff --git a/GPUMD/t-SNE/t-SNE.py b/GPUMD/t-SNE/t-SNE.py new file mode 100644 index 0000000..ab782e2 --- /dev/null +++ b/GPUMD/t-SNE/t-SNE.py @@ -0,0 +1,140 @@ +from pathlib import Path +import numpy as np +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from sklearn.manifold import TSNE +from sklearn.decomposition import PCA + +def tsne_dir_shared_coords( + dir_path: str, + *, + metric: str = "euclidean", # 可试 "cosine";想保留尺度差异用 "euclidean" + perplexity: float = 50.0, # 30k~50k 样本建议 30~50 + n_iter: int = 1000, + early_exaggeration: float = 12.0, + learning_rate = "auto", + standardize: bool = False, + pca_dim: int | None = None, # 先用 PCA 降到 pca_dim(如 20) 再跑 t-SNE,可提速 + context: bool = True, + make_joint: bool = True, + init: str = "pca", + random_state: int = 42 +) -> None: + p = Path(dir_path) + if not p.is_dir(): + raise ValueError(f"{dir_path!r} 不是有效文件夹") + + files = sorted(p.glob("*.npy")) + if not files: + print(f"目录 {p} 中未找到 .npy 文件") + return + + X_list, paths, counts = [], [], [] + for f in files: + try: + data = np.load(f) + if data.ndim != 2: + print(f"[跳过] {f.name}: 期望二维数组,实际 shape={data.shape}") + continue + + # 统一到 (n_samples, 30) + if data.shape[1] == 30: + X = data + elif data.shape[0] == 30: + X = data.T + else: + print(f"[跳过] {f.name}: shape={data.shape}, 未检测到 30 维特征") + continue + + mask = np.isfinite(X).all(axis=1) + if not np.all(mask): + X = X[mask] + print(f"[提示] {f.name}: 移除了含 NaN/Inf 的样本行") + + if X.shape[0] < 3: + print(f"[跳过] {f.name}: 样本数过少(n={X.shape[0]})") + continue + + X_list.append(X) + paths.append(f) + counts.append(X.shape[0]) + except Exception as e: + print(f"[错误] 读取 {f.name} 失败: {e}") + + if not X_list: + print("未找到可用的数据文件") + return + + X_all = np.vstack(X_list) + + if standardize: + mean = X_all.mean(axis=0) + std = X_all.std(axis=0); std[std == 0] = 1.0 + X_all = (X_all - mean) / std + + if pca_dim is not None and pca_dim > 2: + X_all = PCA(n_components=pca_dim, random_state=random_state).fit_transform(X_all) + + tsne = TSNE( + n_components=2, + 
metric=metric,
+        perplexity=float(perplexity),
+        early_exaggeration=float(early_exaggeration),
+        n_iter=int(n_iter),  # note: newer scikit-learn renames this parameter to max_iter
+        learning_rate=learning_rate,
+        init=init,
+        random_state=random_state,
+        method="barnes_hut",  # suited to large sample counts
+        angle=0.5,
+        verbose=0,
+    )
+    Z_all = tsne.fit_transform(X_all)
+
+    # Shared axis limits across all plots
+    x_min, x_max = float(Z_all[:, 0].min()), float(Z_all[:, 0].max())
+    y_min, y_max = float(Z_all[:, 1].min()), float(Z_all[:, 1].max())
+    pad_x = 0.05 * (x_max - x_min) if x_max > x_min else 1.0
+    pad_y = 0.05 * (y_max - y_min) if y_max > y_min else 1.0
+
+    colors = [
+        "#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd",
+        "#8c564b","#e377c2","#7f7f7f","#bcbd22","#17becf"
+    ]
+
+    # One figure per file
+    start = 0
+    for i, (f, n) in enumerate(zip(paths, counts)):
+        Zi = Z_all[start:start + n]; start += n
+        fig, ax = plt.subplots(figsize=(6, 5), dpi=150)
+        if context:
+            ax.scatter(Z_all[:, 0], Z_all[:, 1], s=5, c="#cccccc", alpha=0.35, edgecolors="none", label="All")
+        ax.scatter(Zi[:, 0], Zi[:, 1], s=8, c=colors[i % len(colors)], alpha=0.9, edgecolors="none", label=f.name)
+        ax.set_title(f"{f.name} • t-SNE(shared) (perp={perplexity}, metric={metric})", fontsize=9)
+        ax.set_xlabel("t-SNE-1"); ax.set_ylabel("t-SNE-2")
+        ax.set_xlim(x_min - pad_x, x_max + pad_x); ax.set_ylim(y_min - pad_y, y_max + pad_y)
+        ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
+        if context: ax.legend(loc="best", fontsize=8, frameon=False)
+        fig.tight_layout()
+        out_png = f.with_suffix("").as_posix() + "_tsne_shared.png"
+        fig.savefig(out_png); plt.close(fig)
+        print(f"[done] {f.name} -> {out_png}")
+
+    # Overview figure
+    if make_joint:
+        start = 0
+        fig, ax = plt.subplots(figsize=(7, 6), dpi=150)
+        for i, (f, n) in enumerate(zip(paths, counts)):
+            Zi = Z_all[start:start + n]; start += n
+            ax.scatter(Zi[:, 0], Zi[:, 1], s=8, c=colors[i % len(colors)], alpha=0.85, edgecolors="none", label=f.name)
+        ax.set_title(f"t-SNE(shared) overview (perp={perplexity}, metric={metric})", fontsize=10)
+        ax.set_xlabel("t-SNE-1"); ax.set_ylabel("t-SNE-2")
+        ax.set_xlim(x_min - pad_x, x_max + pad_x); ax.set_ylim(y_min - pad_y, y_max + pad_y)
+        ax.grid(True, linestyle="--", linewidth=0.3, alpha=0.5)
+        ax.legend(loc="best", fontsize=8, frameon=False)
+        fig.tight_layout()
+        out_png = Path(dir_path) / "tsne_shared_overview.png"
+        fig.savefig(out_png.as_posix()); plt.close(fig)
+        print(f"[done] overview -> {out_png}")
+
+if __name__ == "__main__":
+    tsne_dir_shared_coords("data")
\ No newline at end of file
diff --git a/Li_Conductivity/data/conductivity_results.csv b/Li_Conductivity/data/conductivity_results.csv
new file mode 100644
index 0000000..0c64187
--- /dev/null
+++ b/Li_Conductivity/data/conductivity_results.csv
@@ -0,0 +1,3 @@
+Temperature(K),Conductivity(S/m)
+300,0.0148
+425,6.3173
\ No newline at end of file
diff --git a/MSD/main.py b/MSD/main.py
index 730baad..217e055 100644
--- a/MSD/main.py
+++ b/MSD/main.py
@@ -1,7 +1,7 @@
 from utils.MSD import *
 
 if __name__ == '__main__':
-    # file_path_li = 'data/msd_li.dat'
+    # file_path_li = 'raw/msd_li.dat'
     # final_msd = plot_and_get_final_msd(file_path_li, ion_name='Li⁺')
     num_li_ions = 144  # !! Be sure to replace this example value with the real one for your system !!
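
The conductivities in conductivity_results.csv are the kind of numbers produced by the MSD route used in MSD/utils/con2.py: fit the linear region of the Li MSD, take D = slope/6, then apply the Nernst-Einstein relation. Below is a minimal sketch of that conversion, assuming 3D diffusion, uncorrelated carriers, and MSD in Å² against time in ps; the function and argument names are illustrative, not the actual con2.py API.

import numpy as np

E_CHARGE = 1.602176634e-19  # elementary charge, C
K_B = 1.380649e-23          # Boltzmann constant, J/K

def conductivity_from_msd(time_ps, msd_A2, n_ions, volume_A3, temperature_K):
    # Slope of the linear MSD region, in Angstrom^2 per ps.
    slope, _intercept = np.polyfit(time_ps, msd_A2, 1)
    # 3D tracer diffusivity; 1 Angstrom^2/ps = 1e-8 m^2/s.
    diffusivity = slope / 6.0 * 1e-8
    # Carrier number density in 1/m^3 (1 Angstrom^3 = 1e-30 m^3).
    n_density = n_ions / (volume_A3 * 1e-30)
    # Nernst-Einstein: sigma = n q^2 D / (kB T), in S/m.
    return n_density * E_CHARGE**2 * diffusivity / (K_B * temperature_K)

With num_li_ions = 144 and the simulation-cell volume, this returns values in S/m directly comparable to the two rows above.
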
diff --git a/MSD/utils/con2.py b/MSD/utils/con2.py index a101a24..a80570b 100644 --- a/MSD/utils/con2.py +++ b/MSD/utils/con2.py @@ -234,7 +234,7 @@ if __name__ == "__main__": # 检查文件是否存在 msd_path = os.path.join(folder_path, "msd_li.dat") - data_path = os.path.join(folder_path, "LYC.data") + data_path = os.path.join(folder_path, "LYC.raw") if not os.path.exists(msd_path): print(f" 跳过:缺少{msd_path}") continue diff --git a/contrast learning/copy.py b/contrast learning/copy.py index 506f47a..1a82a0a 100644 --- a/contrast learning/copy.py +++ b/contrast learning/copy.py @@ -133,14 +133,134 @@ def copy_cif_with_O_or_S_robust(source_dir: str, target_dir: str, dry_run: bool print(f"模拟运行结束:如果实际运行,将会有 {copied_count} 个文件被复制。") else: print(f"成功复制了 {copied_count} 个文件到目标文件夹。") + + +def copy_cif_without_Br_or_Cl(source_dir: str, target_dir: str, dry_run: bool = False): + """ + 从源文件夹中筛选出内容不含'Br'或'Cl'元素的CIF文件,并复制到目标文件夹。 + (鲁棒版:能正确解析CIF中的元素符号列) + + :param source_dir: 源文件夹路径,包含CIF文件。 + :param target_dir: 目标文件夹路径,用于存放筛选出的文件。 + :param dry_run: 如果为True,则只打印将要复制的文件,而不实际执行复制操作。 + """ + # 1. 路径处理和验证 (与原函数相同) + source_path = Path(source_dir) + target_path = Path(target_dir) + if not source_path.is_dir(): + print(f"错误:源文件夹 '{source_dir}' 不存在或不是一个文件夹。") + return + if not dry_run and not target_path.exists(): + target_path.mkdir(parents=True, exist_ok=True) + print(f"目标文件夹 '{target_dir}' 已创建。") + + print(f"源文件夹: {source_path}") + print(f"目标文件夹: {target_path}") + if dry_run: + print("\n--- *** 模拟运行模式 (Dry Run) *** ---") + print("--- 不会执行任何实际的文件复制操作 ---") + + # 2. 开始遍历和筛选 + print("\n开始扫描源文件夹,剔除含 Br 或 Cl 的CIF文件...") + copied_count = 0 + checked_files = 0 + error_files = 0 + excluded_files = 0 + + # 遍历所有 .cif 文件 + for file_path in source_path.glob('*.cif'): + if not file_path.is_file(): + continue + + checked_files += 1 + contains_br_or_cl = False # 标记文件是否包含 Br 或 Cl + + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + lines = f.readlines() + + # 步骤 A: 找到元素符号在哪一列 + element_col_idx = find_element_column_index(lines) + + if element_col_idx != -1: + # 优先使用结构数据进行精确判断 + for line in lines: + line_stripped = line.strip() + # 忽略空行、注释行和定义行 + if not line_stripped or line_stripped.startswith(('#', '_', 'loop_')): + continue + + parts = line_stripped.split() + # 确保行中有足够的列 + if len(parts) > element_col_idx: + atom_symbol = parts[element_col_idx].strip() + # 检查元素是否为 Br 或 Cl(也考虑类似 Br- 的情况) + if atom_symbol.upper().startswith('BR') or atom_symbol.upper().startswith('CL'): + contains_br_or_cl = True + break # 找到一个就足够,可以停止检查这个文件 + + # 步骤 B: 如果上述方法未找到,使用化学式作为备用检查 + if not contains_br_or_cl: + # 使用 any() 来高效检查,找到一个匹配即停止 + is_in_formula = any( + line.strip().startswith(('_chemical_formula_sum', '_chemical_formula_structural')) and + (' Br' in line or ' Cl' in line) + for line in lines + ) + if is_in_formula: + contains_br_or_cl = True + + # 步骤 C: 根据检查结果决定是否复制 + if contains_br_or_cl: + # 如果包含 Br 或 Cl,则打印信息并跳过 + print(f"排除文件: '{file_path.name}' (检测到 Br 或 Cl 元素)") + excluded_files += 1 + else: + # 如果不包含 Br 或 Cl,则执行复制 + target_file_path = target_path / file_path.name + print(f"找到匹配: '{file_path.name}' (不含 Br 或 Cl)") + if not dry_run: + shutil.copy2(file_path, target_file_path) + copied_count += 1 + + except Exception as e: + error_files += 1 + print(f"!! 处理文件 '{file_path.name}' 时发生错误: {e}") + + # 3. 
打印最终报告 (与原函数类似,增加了排除计数) + print("\n--- 操作总结 ---") + print(f"共检查了 {checked_files} 个.cif文件。") + print(f"排除了 {excluded_files} 个含有 Br 或 Cl 的文件。") + if error_files > 0: + print(f"处理过程中有 {error_files} 个文件发生错误。") + if dry_run: + print(f"模拟运行结束:如果实际运行,将会有 {copied_count} 个文件被复制。") + else: + print(f"成功复制了 {copied_count} 个文件到目标文件夹。") + if __name__ == '__main__': # !! 重要:请将下面的路径修改为您自己电脑上的实际路径 - source_folder = "D:/download/2025-10/data_all/input/input" - target_folder = "D:/download/2025-10/data_all/output" + # source_folder = "D:/download/2025-10/data_all/input/input" + # target_folder = "D:/download/2025-10/data_all/output" + # + # # --- 第一次运行:使用模拟模式 (Dry Run) --- + # print("================ 第一次运行: 模拟模式 ================") + # copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=True) + # + # print("\n\n=======================================================") + # input("检查上面的模拟运行结果。如果符合预期,按回车键继续执行实际复制操作...") + # print("=======================================================") + # + # # --- 第二次运行:实际执行复制 --- + # print("\n================ 第二次运行: 实际复制模式 ================") + # copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=False) + + source_folder = "D:/download/2025-10/data_OS/input" + target_folder = "D:/download/2025-10/data_withoutBrCl/input" # --- 第一次运行:使用模拟模式 (Dry Run) --- print("================ 第一次运行: 模拟模式 ================") - copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=True) + copy_cif_without_Br_or_Cl(source_folder, target_folder, dry_run=True) print("\n\n=======================================================") input("检查上面的模拟运行结果。如果符合预期,按回车键继续执行实际复制操作...") @@ -148,4 +268,4 @@ if __name__ == '__main__': # --- 第二次运行:实际执行复制 --- print("\n================ 第二次运行: 实际复制模式 ================") - copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=False) + copy_cif_without_Br_or_Cl(source_folder, target_folder, dry_run=False) diff --git a/contrast learning/split.py b/contrast learning/split.py new file mode 100644 index 0000000..f5bdec3 --- /dev/null +++ b/contrast learning/split.py @@ -0,0 +1,111 @@ +import os +import shutil +import random +from pathlib import Path + + +def split_dataset(source_dir: str, output_dir: str, test_ratio: float = 0.2): + """ + 将源文件夹中的文件按比例划分到输出文件夹下的 train 和 test 子目录中。 + + Args: + source_dir (str): 包含所有数据文件的源文件夹路径。 + output_dir (str): 用于存放'train'和'test'文件夹的目标文件夹路径。 + test_ratio (float, optional): 测试集所占的比例。默认为 0.2。 + """ + print("--- 开始执行数据集划分 ---") + + # 1. 路径处理和验证 + source_path = Path(source_dir) + output_path = Path(output_dir) + + if not source_path.is_dir(): + print(f"错误:源文件夹 '{source_dir}' 不存在或不是一个目录。") + return + + # 2. 创建输出文件夹 (train 和 test) + train_dir = output_path / 'train' + test_dir = output_path / 'test' + + try: + os.makedirs(train_dir, exist_ok=True) + os.makedirs(test_dir, exist_ok=True) + print(f"输出目录已准备好: \n 训练集 -> {train_dir}\n 测试集 -> {test_dir}") + except OSError as e: + print(f"错误:创建输出目录时发生错误: {e}") + return + + # 3. 获取所有文件并随机打乱 + all_files = [f for f in source_path.iterdir() if f.is_file()] + + if not all_files: + print(f"警告:源文件夹 '{source_dir}' 中没有文件可供划分。") + return + + random.shuffle(all_files) + total_files = len(all_files) + print(f"在源文件夹中找到 {total_files} 个文件。") + + # 4. 计算分割数量 + num_test = int(total_files * test_ratio) + num_train = total_files - num_test + + print(f"划分计划 -> 训练集: {num_train} 个文件 | 测试集: {num_test} 个文件") + + # 5. 分割文件列表 + test_files = all_files[:num_test] + train_files = all_files[num_test:] + + # 6. 
定义一个复制/移动文件的辅助函数 + def copy_files(files_to_copy, destination_dir): + copied_count = 0 + for file_path in files_to_copy: + try: + # 注意:这里使用的是复制(copy),更安全。 + # 如果你确认要移动(move)并且清空源文件夹,请将 shutil.copy 改为 shutil.move + shutil.copy(file_path, destination_dir) + copied_count += 1 + except Exception as e: + print(f"处理文件 '{file_path.name}' 时出错: {e}") + return copied_count + + # 7. 复制文件到对应的文件夹 + print(f"\n正在复制文件到 'train' 文件夹...") + copied_train = copy_files(train_files, train_dir) + print(f"成功复制 {copied_train} 个文件到训练集。") + + print(f"\n正在复制文件到 'test' 文件夹...") + copied_test = copy_files(test_files, test_dir) + print(f"成功复制 {copied_test} 个文件到测试集。") + + print("\n--- 数据集划分完成! ---") + + +# --- 如何使用这个函数 --- +if __name__ == '__main__': + # --- 请在这里配置你的文件夹路径 --- + + # 你的原始数据集所在的文件夹 + # 例如: 'C:/Users/YourUser/Desktop/my_dataset' (Windows) + # 或: '/home/user/project/raw/all_images' (Linux/macOS) + SOURCE_DATA_DIR = 'D:/download/2025-10/data_OS/input/S' + + # 你希望将'train'和'test'文件夹创建在哪里 + # 例如: 'C:/Users/YourUser/Desktop/split_output' (Windows) + # 或: '/home/user/project/raw/processed' (Linux/macOS) + # 如果使用 '.', 表示在当前脚本所在的目录下创建 + OUTPUT_DIR = 'D:/download/2025-10/data_OS/train/S' + + # --- 配置完成,下面是调用函数 --- + + # 检查示例路径是否存在,如果不存在则创建并填充一些假文件用于演示 + if not os.path.exists(SOURCE_DATA_DIR): + print(f"演示目录 '{SOURCE_DATA_DIR}' 不存在,正在创建并生成100个示例文件...") + os.makedirs(SOURCE_DATA_DIR) + for i in range(100): + with open(os.path.join(SOURCE_DATA_DIR, f'file_{i + 1:03d}.txt'), 'w') as f: + f.write(f'This is file {i + 1}.') + print("示例文件创建完毕。") + + # 调用函数执行划分 + split_dataset(SOURCE_DATA_DIR, OUTPUT_DIR, test_ratio=0.2) diff --git a/corner-sharing/0923_CS.py b/corner-sharing/0923_CS.py index 7ec61d3..b5eac2f 100644 --- a/corner-sharing/0923_CS.py +++ b/corner-sharing/0923_CS.py @@ -82,7 +82,7 @@ def process_cif_folder(cif_folder_path: str, output_csv_path: str): if __name__ == "__main__": # ----- 参数配置 ----- # 请将此路径修改为您存放CIF文件的文件夹的实际路径 - CIF_DIRECTORY = "data/0921" + CIF_DIRECTORY = "raw/0921" # 输出的CSV文件名 OUTPUT_CSV = "corner_sharing_results.csv" diff --git a/corner-sharing/utils/CS_analyse.py b/corner-sharing/utils/CS_analyse.py index 587edd1..e8818dc 100644 --- a/corner-sharing/utils/CS_analyse.py +++ b/corner-sharing/utils/CS_analyse.py @@ -349,7 +349,7 @@ def check_only_corner_sharing(sharing_results: Dict[str, Dict[str, int]]) -> int else: return 0 # 没有任何共享关系,也返回 0 -# structure = Structure.from_file("../data/0921/wjy_001.cif") +# structure = Structure.from_file("../raw/0921/wjy_001.cif") # a = CS_catulate(structure,notice=True) # b = CS_count(structure,a) # print(f"{a}\n{b}") diff --git a/corner-sharing/utils/analyze_env_st.py b/corner-sharing/utils/analyze_env_st.py index 6275216..48666bc 100644 --- a/corner-sharing/utils/analyze_env_st.py +++ b/corner-sharing/utils/analyze_env_st.py @@ -205,6 +205,6 @@ def export_envs(envlist, sp='Li', envtype='both', fname=None): f.write("Site index " + str(index) + ": " + str(i) + '\n') -# struct = Structure.from_file("../data/0921/wjy_475.cif") +# struct = Structure.from_file("../raw/0921/wjy_475.cif") # site_info = extract_sites(struct, envtype="both") # export_envs(site_info, sp="Li", envtype="both") \ No newline at end of file diff --git a/data_get/new_v1/data/input/最新数据库核查过gsj.xlsx b/data_get/new_v1/data/input/最新数据库核查过gsj.xlsx new file mode 100644 index 0000000..de59655 Binary files /dev/null and b/data_get/new_v1/data/input/最新数据库核查过gsj.xlsx differ diff --git a/data_get/new_v1/data_get.py b/data_get/new_v1/data_get.py new file mode 100644 index 0000000..b3078e1 --- /dev/null +++ 
b/data_get/new_v1/data_get.py @@ -0,0 +1,127 @@ +import pandas as pd +import os +import re + + +def extract_cif_from_xlsx( + xlsx_path: str, + output_dir: str, + naming_mode: str = 'formula', + name_col: int = 0, + cif_col: int = 1, + prefix: str = 'wjy' +): + """ + 从 XLSX 文件中提取 CIF 数据并保存为单独的 .cif 文件。 + + Args: + xlsx_path (str): 输入的 XLSX 文件的路径。 + output_dir (str): 输出 .cif 文件的文件夹路径。 + naming_mode (str, optional): CIF 文件的命名模式。 + 可选值为 'formula' (使用第一列的名字) 或 + 'auto' (使用前缀+自动递增编号)。 + 默认为 'formula'。 + name_col (int, optional): 包含文件名的列的索引(从0开始)。默认为 0。 + cif_col (int, optional): 包含 CIF 内容的列的索引(从0开始)。默认为 1。 + prefix (str, optional): 在 'auto' 命名模式下使用的文件名前缀。默认为 'wjy'。 + + Raises: + FileNotFoundError: 如果指定的 xlsx_path 文件不存在。 + ValueError: 如果指定的 naming_mode 无效。 + Exception: 处理过程中发生的其他错误。 + """ + # --- 1. 参数校验和准备 --- + if not os.path.exists(xlsx_path): + raise FileNotFoundError(f"错误: 输入文件未找到 -> {xlsx_path}") + + if naming_mode not in ['formula', 'auto']: + raise ValueError(f"错误: 'naming_mode' 参数必须是 'formula' 或 'auto',但收到了 '{naming_mode}'") + + # 创建输出目录(如果不存在) + os.makedirs(output_dir, exist_ok=True) + print(f"CIF 文件将保存到: {output_dir}") + + try: + # --- 2. 读取 XLSX 文件 --- + # header=None 表示第一行不是标题,将其作为数据读取 + df = pd.read_excel(xlsx_path, header=None) + + # 跳过原始文件的表头行('formula', 'cif') + if str(df.iloc[0, name_col]).strip().lower() == 'formula' and str(df.iloc[0, cif_col]).strip().lower() == 'cif': + df = df.iloc[1:] + print("检测到并跳过了表头行。") + + # --- 3. 遍历数据并生成文件 --- + success_count = 0 + for index, row in df.iterrows(): + # 获取文件名和 CIF 内容 + formula_name = str(row[name_col]) + cif_content = str(row[cif_col]) + + # 跳过内容为空的行 + if pd.isna(row[name_col]) or pd.isna(row[cif_col]) or not cif_content.strip(): + print(f"警告: 第 {index + 2} 行数据不完整,已跳过。") + continue + + # --- 4. 根据命名模式确定文件名 --- + if naming_mode == 'formula': + # 清理文件名,替换掉不适合做文件名的特殊字符 + # 例如:将 (PO4)3 替换为 _PO4_3,将 / 替换为 _ + safe_filename = re.sub(r'[\\/*?:"<>|()]', '_', formula_name) + filename = f"{safe_filename}.cif" + else: # naming_mode == 'auto' + # 使用 format 方法来确保编号格式统一,例如 001, 002 + filename = f"{prefix}_{success_count + 1:03d}.cif" + + # 构造完整的输出文件路径 + output_path = os.path.join(output_dir, filename) + + # --- 5. 
写入 CIF 文件 --- + try: + with open(output_path, 'w', encoding='utf-8') as f: + f.write(cif_content) + success_count += 1 + except IOError as e: + print(f"错误: 无法写入文件 {output_path}。原因: {e}") + + print(f"\n处理完成!成功提取并生成了 {success_count} 个 CIF 文件。") + + except Exception as e: + print(f"处理 XLSX 文件时发生错误: {e}") + + +# --- 函数使用示例 --- +if __name__ == '__main__': + # 假设您的 XLSX 文件名为 'materials.xlsx',且与此脚本在同一目录下 + source_xlsx_file = 'input/cif_dataset.xlsx' + + # 检查示例文件是否存在,如果不存在则创建一个 + if not os.path.exists(source_xlsx_file): + print(f"未找到示例文件 '{source_xlsx_file}',正在创建一个...") + example_data = { + 'formula': ['Li3Al0.3Ti1.7(PO4)3', 'Li6.5La3Zr1.75W0.25O12', 'Invalid/Name*Test'], + 'cif': ['# CIF Data for Li3Al0.3...\n_atom_site_type_symbol\n Li\n Al\n Ti\n P\n O', + '# CIF Data for Li6.5La3...\n_symmetry_space_group_name_H-M \'I a -3 d\'', + '# CIF Data for Invalid Name Test'] + } + pd.DataFrame(example_data).to_excel(source_xlsx_file, index=False, header=True) + print("示例文件创建成功。") + + # --- 示例 1: 使用第一列的 'formula' 命名 --- + # print("\n--- 示例 1: 使用 'formula' 命名模式 ---") + # output_folder_1 = 'cif_by_formula' + # extract_cif_from_xlsx( + # xlsx_path=source_xlsx_file, + # output_dir=output_folder_1, + # naming_mode='formula' + # ) + + # --- 示例 2: 使用 'wjy+编号' 自动命名 --- + print("\n--- 示例 2: 使用 'auto' 命名模式 ---") + output_folder_2 = 'cif_by_auto' + extract_cif_from_xlsx( + xlsx_path=source_xlsx_file, + output_dir=output_folder_2, + naming_mode='auto', + prefix='wjy' + ) \ No newline at end of file diff --git a/dpgen/create_supercell_poscar.py b/dpgen/create_supercell_poscar.py new file mode 100644 index 0000000..da25097 --- /dev/null +++ b/dpgen/create_supercell_poscar.py @@ -0,0 +1,79 @@ +from pymatgen.core import Structure +from pymatgen.io.vasp import Poscar + +def create_supercell_poscar(cif_path, supercell_matrix, output_filename="POSCAR_supercell"): + """ + 从CIF文件读取晶体结构,根据指定的矩阵进行扩胞,并生成VASP POSCAR文件。 + + Args: + cif_path (str): 输入的CIF文件路径。 + supercell_matrix (list or tuple): 3x3的扩胞矩阵。 + - 对于简单的对角扩胞 (例如 2x2x4),使用: [[2, 0, 0], [0, 2, 0], [0, 0, 4]] + - 对于非对角扩胞 (例如 a_s=3a, b_s=2a+4b, c_s=6c),使用: [[3, 0, 0], [2, 4, 0], [0, 0, 6]] + output_filename (str): 输出的POSCAR文件名。默认为 "POSCAR_supercell"。 + + Returns: + bool: 如果成功生成文件则返回 True,否则返回 False。 + """ + try: + # 1. 从CIF文件加载结构 + # 使用 from_file 静态方法直接读取 + # primitive=False 确保我们使用CIF中定义的晶胞,而不是其原胞 + original_structure = Structure.from_file(cif_path, primitive=False) + + print("--- 原始晶胞信息 ---") + print(f" 原子数: {original_structure.num_sites}") + print(f" 化学式: {original_structure.composition.reduced_formula}") + print(f" 晶格参数 (a, b, c, α, β, γ):") + lat = original_structure.lattice + print(f" {lat.a:.4f}, {lat.b:.4f}, {lat.c:.4f}, {lat.alpha:.2f}, {lat.beta:.2f}, {lat.gamma:.2f}") + + # 2. 进行扩胞操作 + # 注意:pymatgen 会自动处理原子坐标的映射 + supercell_structure = original_structure * supercell_matrix + + print("\n--- 扩胞后信息 ---") + print(f" 扩胞矩阵: {supercell_matrix}") + print(f" 新原子数: {supercell_structure.num_sites}") + print(f" 新化学式: {supercell_structure.composition.reduced_formula}") + print(f" 新晶格参数 (a, b, c, α, β, γ):") + super_lat = supercell_structure.lattice + print(f" {super_lat.a:.4f}, {super_lat.b:.4f}, {super_lat.c:.4f}, {super_lat.alpha:.2f}, {super_lat.beta:.2f}, {super_lat.gamma:.2f}") + + # 3. 
创建Poscar对象并写入文件 + # comment 参数可以设置POSCAR文件的第一行注释 + poscar = Poscar(supercell_structure, comment=f"Supercell from {cif_path}") + poscar.write_file(output_filename) + + print(f"\n成功!已将扩胞结构写入文件: {output_filename}") + return True + + except Exception as e: + print(f"发生错误: {e}") + return False + +# --- 使用示例 --- + +# 假设您的CIF文件名为 "origin.cif",并且与此脚本在同一目录下。 +# 如果您在复现 Wang Shuo 的工作,他们可能使用了不同的扩胞方案。 +# 例如,用于MD模拟的大超胞是 2x2x4 [source_id: 3]。 +# 而用于DP-GEN探索的小超胞是 1x1x2 [source_id: 3]。 + +# 示例1:生成用于DP-GEN探索的 1x1x2 小超胞 (60个原子) +print("="*40) +print("正在生成 1x1x2 超胞 (用于 DP-GEN 探索)...") +matrix_1x1x2 = [[1, 0, 0], [0, 1, 0], [0, 0, 2]] +create_supercell_poscar("data/P3ma/origin.cif", matrix_1x1x2, "data/P3ma/output/POSCAR_1x1x2_60atoms") + +# 示例2:生成用于LAMMPS MD模拟的 2x2x4 大超胞 (480个原子) +# print("\n" + "="*40) +# print("正在生成 2x2x4 超胞 (用于 LAMMPS MD 模拟)...") +# matrix_2x2x4 = [[2, 0, 0], [0, 2, 0], [0, 0, 4]] +# create_supercell_poscar("origin.cif", matrix_2x2x4, "POSCAR_2x2x4_480atoms") +# +# # 示例3:生成 Geng 等人研究中使用的大超胞 (2160个原子) [source_id: 1] +# # 这个扩胞矩阵 a_s = 3a_0, b_s = 2a_0 + 4b_0, c_s = 6c_0 [source_id: 1] +# print("\n" + "="*40) +# print("正在生成非对角扩胞超胞 (Geng et al.)...") +# matrix_geng = [[3, 0, 0], [2, 4, 0], [0, 0, 6]] +# create_supercell_poscar("origin.cif", matrix_geng, "POSCAR_Geng_2160atoms") \ No newline at end of file diff --git a/dpgen/plus.py b/dpgen/plus.py index 8d496af..4dabe50 100644 --- a/dpgen/plus.py +++ b/dpgen/plus.py @@ -147,5 +147,5 @@ def make_pnma_poscar_from_cif(cif_path: str, print(f"写出 {out_poscar};总原子数 = {len(s)};组成 = {comp}") if __name__=="__main__": - # make_model3_poscar_from_cif("data/P3ma/model3.cif","data/P3ma/supercell_model4.poscar") - make_pnma_poscar_from_cif("data/Pnma/origin.cif","data/Pnma/supercell_pnma.poscar",seed=42) \ No newline at end of file + # make_model3_poscar_from_cif("raw/P3ma/model3.cif","raw/P3ma/supercell_model4.poscar") + make_pnma_poscar_from_cif("data/Pnma/origin.cif","raw/Pnma/supercell_pnma.poscar",seed=42) \ No newline at end of file diff --git a/dpgen/supercell_make_p3ma.py b/dpgen/supercell_make_p3ma.py index e69de29..7a73566 100644 --- a/dpgen/supercell_make_p3ma.py +++ b/dpgen/supercell_make_p3ma.py @@ -0,0 +1,240 @@ +import pymatgen.core as mg +from pymatgen.io.cif import CifParser +from pymatgen.transformations.standard_transformations import SupercellTransformation +import random +import os + + +def create_ordered_structure_from_disordered(disordered_structure): + """ + 手动将包含部分占位的无序结构转换为有序结构,借鉴plus.py的思路。 + """ + s = disordered_structure.copy() + + # 识别需要处理的部分占位 + # 根据 model3.cif, Y2(z≈0.488, occ=0.75), Y3(z≈-0.065, occ=0.25), Li2(z≈0.5, occ=0.5) [model3.cif] + y2_indices, y3_indices, li2_indices = [], [], [] + + for i, site in enumerate(s.sites): + # 使用z坐标来识别特定的部分占位 + z = site.frac_coords[2] + if site.species_string == "Y": + if abs(z - 0.488) < 0.05: + y2_indices.append(i) + elif abs(z - (-0.065)) < 0.05 or abs(z - (1 - 0.065)) < 0.05: + y3_indices.append(i) + elif site.species_string == "Li": + if abs(z - 0.5) < 0.05: + li2_indices.append(i) + + # 根据占位率随机选择要保留的原子 + def choose_keep(indices, keep_fraction): + num_to_keep = int(round(len(indices) * keep_fraction)) + return set(random.sample(indices, num_to_keep)) + + keep_y2 = choose_keep(y2_indices, 0.75) + keep_y3 = choose_keep(y3_indices, 0.25) + keep_li2 = choose_keep(li2_indices, 0.50) + + # 找出所有需要删除的原子索引 + to_remove_indices = [i for i in y2_indices if i not in keep_y2] + to_remove_indices.extend([i for i in y3_indices if i not in keep_y3]) + to_remove_indices.extend([i for i in li2_indices 
if i not in keep_li2]) + + # 从后往前删除,避免索引错位 + s.remove_sites(sorted(to_remove_indices, reverse=True)) + + # --- 关键修复步骤 --- + # 最终清理,确保所有位点都是有序的 + for i, site in enumerate(s.sites): + if not site.is_ordered: + # 将Composition对象转换为字典,然后找到占位率最高的元素 [plus.py] + species_dict = site.species.as_dict() + main_specie = max(species_dict.items(), key=lambda item: item[1])[0] + s.replace(i, main_specie) + + return s + + +def create_supercells_from_file(cif_path, output_path="."): + """ + 根据给定的CIF文件路径,生成三种不同尺寸和缺陷的超胞,并保存为POSCAR文件。 + """ + if not os.path.exists(cif_path): + print(f"错误: 文件 '{cif_path}' 不存在。") + return + + print(f"正在从 {cif_path} 读取结构...") + parser = CifParser(cif_path) + disordered_structure = parser.parse_structures(primitive=False)[0] + + structure = create_ordered_structure_from_disordered(disordered_structure) + print(f"成功将无序结构转换为一个包含 {len(structure)} 个原子的有序单胞。") + + os.makedirs(output_path, exist_ok=True) + + # 任务一:生成60原子超胞 (无缺陷) + print("\n--- 正在生成 60原子无缺陷超胞 (1x1x2) ---") + tf_60 = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 2]]) + sc_60_no_defect = tf_60.apply_transformation(structure) + print(f"原子总数: {len(sc_60_no_defect)}, 化学式: {sc_60_no_defect.composition.reduced_formula}") + sc_60_no_defect.to(fmt="poscar", filename=os.path.join(output_path, "POSCAR_60_no_defect")) + print(f"已保存文件: {os.path.join(output_path, 'POSCAR_60_no_defect')}") + + # 任务二:生成60原子超胞 (含一对反位缺陷) + print("\n--- 正在生成 60原子含一对反位缺陷超胞 ---") + sc_60_defect = sc_60_no_defect.copy() + li_indices = [i for i, site in enumerate(sc_60_defect.sites) if site.species_string == 'Li'] + y_indices = [i for i, site in enumerate(sc_60_defect.sites) if site.species_string == 'Y'] + + if li_indices and y_indices: + li_swap_idx, y_swap_idx = random.choice(li_indices), random.choice(y_indices) + sc_60_defect.replace(li_swap_idx, "Y") + sc_60_defect.replace(y_swap_idx, "Li") + print(f"成功引入一对反位缺陷。浓度: {2 / (len(li_indices) + len(y_indices)) * 100:.2f}%") + sc_60_defect.to(fmt="poscar", filename=os.path.join(output_path, "POSCAR_60_antisite_defect")) + print(f"已保存文件: {os.path.join(output_path, 'POSCAR_60_antisite_defect')}") + + # 任务三:生成90原子超胞 (含一对反位缺陷) + print("\n--- 正在生成 90原子含一对反位缺陷超胞 ---") + tf_90 = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 3]]) + sc_90_no_defect = tf_90.apply_transformation(structure) + sc_90_defect = sc_90_no_defect.copy() + li_indices = [i for i, site in enumerate(sc_90_defect.sites) if site.species_string == 'Li'] + y_indices = [i for i, site in enumerate(sc_90_defect.sites) if site.species_string == 'Y'] + + if li_indices and y_indices: + li_swap_idx, y_swap_idx = random.choice(li_indices), random.choice(y_indices) + sc_90_defect.replace(li_swap_idx, "Y") + sc_90_defect.replace(y_swap_idx, "Li") + print(f"原子总数: {len(sc_90_defect)}, 浓度: {2 / (len(li_indices) + len(y_indices)) * 100:.2f}%") + sc_90_defect.to(fmt="poscar", filename=os.path.join(output_path, "POSCAR_90_antisite_defect")) + print(f"已保存文件: {os.path.join(output_path, 'POSCAR_90_antisite_defect')}") + + +def create_ordered_p3ma_structure(disordered_structure): + """ + 手动将P3ma相的无序结构(包含Y2, Y3, Li2的部分占位)转换为有序结构。 + """ + s = disordered_structure.copy() + + # 根据 model3.cif, 识别Y2(z≈0.488, occ=0.75), Y3(z≈-0.065, occ=0.25), Li2(z≈0.5, occ=0.5) [model3.cif] + y2_indices, y3_indices, li2_indices = [], [], [] + + for i, site in enumerate(s.sites): + z = site.frac_coords[2] + if site.species_string == "Y": + if abs(z - 0.488) < 0.05: + y2_indices.append(i) + elif abs(z - (-0.065)) < 0.05 or abs(z - (1 - 0.065)) < 0.05: + y3_indices.append(i) + elif 
site.species_string == "Li": + if abs(z - 0.5) < 0.05: + li2_indices.append(i) + + # 根据占位率随机选择要保留的原子 + def choose_keep(indices, keep_fraction): + num_to_keep = int(round(len(indices) * keep_fraction)) + return set(random.sample(indices, num_to_keep)) + + keep_y2 = choose_keep(y2_indices, 0.75) + keep_y3 = choose_keep(y3_indices, 0.25) + keep_li2 = choose_keep(li2_indices, 0.50) + + # 找出所有需要删除的原子索引 + to_remove_indices = [i for i in y2_indices if i not in keep_y2] + to_remove_indices.extend([i for i in y3_indices if i not in keep_y3]) + to_remove_indices.extend([i for i in li2_indices if i not in keep_li2]) + + s.remove_sites(sorted(to_remove_indices, reverse=True)) + + # 最终清理,确保所有位点都是有序的 + for i, site in enumerate(s.sites): + if not site.is_ordered: + species_dict = site.species.as_dict() + main_specie = max(species_dict.items(), key=lambda item: item[1])[0] + s.replace(i, main_specie) + + return s + + +def create_multiple_p3ma_supercells(cif_path, num_configs=5, output_path="."): + """ + 读取P3ma相CIF,为不同尺寸的超胞生成多个具有不同反位缺陷位置的构型。 + """ + if not os.path.exists(cif_path): + print(f"错误: 文件 '{cif_path}' 不存在。") + return + + print(f"正在从 {cif_path} 读取P3ma结构...") + parser = CifParser(cif_path) + disordered_structure = parser.parse_structures(primitive=False)[0] + + structure = create_ordered_p3ma_structure(disordered_structure) + print(f"成功将无序P3ma结构转换为一个包含 {len(structure)} 个原子的有序单胞。") + + os.makedirs(output_path, exist_ok=True) + + target_sizes = [60, 90] + for size in target_sizes: + print(f"\n--- 正在为约 {size} 原子的版本生成 {num_configs} 个不同构型 ---") + + # 1. 构建基准超胞 + if size == 60: + tf = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 2]]) + filename_suffix = "60_approx" + else: # size == 90 + tf = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 3]]) + filename_suffix = "90_approx" + + base_supercell = tf.apply_transformation(structure) + print(f"已生成基准超胞,实际原子数: {len(base_supercell)}") + + li_indices = [i for i, site in enumerate(base_supercell.sites) if site.species_string == 'Li'] + y_indices = [i for i, site in enumerate(base_supercell.sites) if site.species_string == 'Y'] + + if not li_indices or not y_indices: + print("错误:在超胞中未找到足够的Li或Y原子来引入缺陷。") + continue + + # 2. 循环生成多个独特的缺陷构型 + used_pairs = set() + for i in range(num_configs): + defect_supercell = base_supercell.copy() + + # 确保随机选择的交换对是全新的 + # 增加一个尝试次数上限,防止在原子数很少时陷入死循环 + max_tries = len(li_indices) * len(y_indices) + for _ in range(max_tries): + li_swap_idx = random.choice(li_indices) + y_swap_idx = random.choice(y_indices) + pair = tuple(sorted((li_swap_idx, y_swap_idx))) + if pair not in used_pairs: + used_pairs.add(pair) + break + else: + print(f" 警告: 未能找到更多独特的交换对,已停止在第 {i} 个构型。") + break + + # 引入缺陷 + defect_supercell.replace(li_swap_idx, "Y") + defect_supercell.replace(y_swap_idx, "Li") + + print(f" 配置 {i}: 成功引入一对反位缺陷 (Li at index {li_swap_idx} <-> Y at index {y_swap_idx})。") + + # 3. 保存为带编号的POSCAR文件 + poscar_filename = f"POSCAR_P3ma_{filename_suffix}_antisite_defect_{i}" + poscar_path = os.path.join(output_path, poscar_filename) + defect_supercell.to(fmt="poscar", filename=poscar_path) + print(f" 已保存文件: {poscar_path}") + +if __name__ == '__main__': + # --- 使用方法 --- + # 1. 将您的CIF文件保存,例如命名为 "Li3YCl6.cif" + # 2. 
将文件名作为参数传递给函数 + cif_file_path = "data/P3ma/model3.cif" # 修改为您的CIF文件名 + output_directory = "raw/P3ma/output" # 可以指定一个输出目录 + + # create_supercells_from_file(cif_file_path, output_directory) + create_multiple_p3ma_supercells(cif_file_path,output_path=output_directory) + print("所有任务完成!") \ No newline at end of file diff --git a/dpgen/supercell_make_pnma.py b/dpgen/supercell_make_pnma.py new file mode 100644 index 0000000..f95e48b --- /dev/null +++ b/dpgen/supercell_make_pnma.py @@ -0,0 +1,115 @@ +import pymatgen.core as mg +from pymatgen.io.cif import CifParser +from pymatgen.transformations.standard_transformations import SupercellTransformation +import random +import os + + +def create_ordered_pnma_structure(disordered_structure): + """ + 手动将Pnma相的无序结构(主要为Li的部分占位)转换为有序结构。 + """ + s = disordered_structure.copy() + + # 根据origin.cif, Li位点的占位率为0.75 [5] + partial_li_indices = [i for i, site in enumerate(s.sites) if "Li" in site.species and not site.is_ordered] + + # 根据0.75的占位率随机选择要保留的Li原子 + num_to_keep = int(round(len(partial_li_indices) * 0.75)) + keep_indices = set(random.sample(partial_li_indices, num_to_keep)) + + # 找出需要删除的原子索引 + to_remove_indices = [i for i in partial_li_indices if i not in keep_indices] + + s.remove_sites(sorted(to_remove_indices, reverse=True)) + + # 重新创建一个新的、完全有序的结构,避免任何副作用 + ordered_species = [] + ordered_coords = [] + for site in s.sites: + # 只取每个位点的主要元素 + main_specie = site.species.elements[0] + ordered_species.append(main_specie) + ordered_coords.append(site.frac_coords) + + final_structure = mg.Structure(s.lattice, ordered_species, ordered_coords) + + return final_structure + + +def create_multiple_pnma_supercells(cif_path, num_configs=3, output_path="."): + """ + 读取Pnma相CIF,为不同尺寸的超胞生成多个具有不同反位缺陷位置的构型。 + """ + if not os.path.exists(cif_path): + print(f"错误: 文件 '{cif_path}' 不存在。") + return + + print(f"正在从 {cif_path} 读取Pnma结构...") + parser = CifParser(cif_path) + disordered_structure = parser.parse_structures(primitive=False)[0] + + structure = create_ordered_pnma_structure(disordered_structure) + print(f"成功将无序Pnma结构转换为一个包含 {len(structure)} 个原子的有序单胞。") + + os.makedirs(output_path, exist_ok=True) + + target_sizes = [60, 90] + for size in target_sizes: + print(f"\n--- 正在为约 {size} 原子的版本生成 {num_configs} 个不同构型 ---") + + # 1. 构建基准超胞 + if size == 60: + tf = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 2]]) + filename_suffix = "60_approx" + else: # size == 90 + tf = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 3]]) + filename_suffix = "90_approx" + + base_supercell = tf.apply_transformation(structure) + print(f"已生成基准超胞,实际原子数: {len(base_supercell)}") + + li_indices = [i for i, site in enumerate(base_supercell.sites) if site.species_string == 'Li'] + y_indices = [i for i, site in enumerate(base_supercell.sites) if site.species_string == 'Y'] + + if not li_indices or not y_indices: + print("错误:在超胞中未找到足够的Li或Y原子来引入缺陷。") + continue + + # 2. 循环生成多个独特的缺陷构型 + used_pairs = set() + for i in range(num_configs): + defect_supercell = base_supercell.copy() + + # 确保随机选择的交换对是全新的 + while True: + li_swap_idx = random.choice(li_indices) + y_swap_idx = random.choice(y_indices) + # 使用排序后的元组作为键,确保(a,b)和(b,a)被视为相同 + pair = tuple(sorted((li_swap_idx, y_swap_idx))) + if pair not in used_pairs: + used_pairs.add(pair) + break + + # 引入缺陷 + defect_supercell.replace(li_swap_idx, "Y") + defect_supercell.replace(y_swap_idx, "Li") + + print(f" 配置 {i}: 成功引入一对反位缺陷 (Li at index {li_swap_idx} <-> Y at index {y_swap_idx})。") + + # 3. 
diff --git a/dpgen/supercell_make_pnma.py b/dpgen/supercell_make_pnma.py
new file mode 100644
index 0000000..f95e48b
--- /dev/null
+++ b/dpgen/supercell_make_pnma.py
@@ -0,0 +1,115 @@
+import pymatgen.core as mg
+from pymatgen.io.cif import CifParser
+from pymatgen.transformations.standard_transformations import SupercellTransformation
+import random
+import os
+
+
+def create_ordered_pnma_structure(disordered_structure):
+    """
+    Manually convert the disordered Pnma structure (mainly partial Li occupancy)
+    into an ordered one.
+    """
+    s = disordered_structure.copy()
+
+    # According to origin.cif, the Li sites have an occupancy of 0.75 [5]
+    partial_li_indices = [i for i, site in enumerate(s.sites) if "Li" in site.species and not site.is_ordered]
+
+    # Randomly choose the Li atoms to keep at 0.75 occupancy
+    num_to_keep = int(round(len(partial_li_indices) * 0.75))
+    keep_indices = set(random.sample(partial_li_indices, num_to_keep))
+
+    # Collect the indices of atoms to delete
+    to_remove_indices = [i for i in partial_li_indices if i not in keep_indices]
+
+    s.remove_sites(sorted(to_remove_indices, reverse=True))
+
+    # Rebuild a fresh, fully ordered structure to avoid any side effects
+    ordered_species = []
+    ordered_coords = []
+    for site in s.sites:
+        # Keep only the dominant element on each site
+        main_specie = site.species.elements[0]
+        ordered_species.append(main_specie)
+        ordered_coords.append(site.frac_coords)
+
+    final_structure = mg.Structure(s.lattice, ordered_species, ordered_coords)
+
+    return final_structure
+
+
+def create_multiple_pnma_supercells(cif_path, num_configs=3, output_path="."):
+    """
+    Read the Pnma-phase CIF and, for each target supercell size, generate several
+    configurations with differently placed antisite defects.
+    """
+    if not os.path.exists(cif_path):
+        print(f"Error: file '{cif_path}' does not exist.")
+        return
+
+    print(f"Reading Pnma structure from {cif_path} ...")
+    parser = CifParser(cif_path)
+    disordered_structure = parser.parse_structures(primitive=False)[0]
+
+    structure = create_ordered_pnma_structure(disordered_structure)
+    print(f"Converted the disordered Pnma structure into an ordered unit cell of {len(structure)} atoms.")
+
+    os.makedirs(output_path, exist_ok=True)
+
+    target_sizes = [60, 90]
+    for size in target_sizes:
+        print(f"\n--- Generating {num_configs} configurations for the ~{size}-atom version ---")
+
+        # 1. Build the base supercell
+        if size == 60:
+            tf = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 2]])
+            filename_suffix = "60_approx"
+        else:  # size == 90
+            tf = SupercellTransformation([[1, 0, 0], [0, 1, 0], [0, 0, 3]])
+            filename_suffix = "90_approx"
+
+        base_supercell = tf.apply_transformation(structure)
+        print(f"Base supercell built; actual atom count: {len(base_supercell)}")
+
+        li_indices = [i for i, site in enumerate(base_supercell.sites) if site.species_string == 'Li']
+        y_indices = [i for i, site in enumerate(base_supercell.sites) if site.species_string == 'Y']
+
+        if not li_indices or not y_indices:
+            print("Error: not enough Li or Y atoms in the supercell to introduce defects.")
+            continue
+
+        # 2. Generate several distinct defect configurations
+        used_pairs = set()
+        for i in range(num_configs):
+            defect_supercell = base_supercell.copy()
+
+            # Make sure each randomly chosen swap pair is new.
+            # Cap the attempts (as in the P3ma script) so this cannot spin forever
+            # once all pairs have been used.
+            max_tries = len(li_indices) * len(y_indices)
+            for _ in range(max_tries):
+                li_swap_idx = random.choice(li_indices)
+                y_swap_idx = random.choice(y_indices)
+                # Use the sorted tuple as the key so (a, b) and (b, a) count as the same pair
+                pair = tuple(sorted((li_swap_idx, y_swap_idx)))
+                if pair not in used_pairs:
+                    used_pairs.add(pair)
+                    break
+            else:
+                print(f"  Warning: no more unique swap pairs; stopped at configuration {i}.")
+                break
+
+            # Introduce the antisite pair
+            defect_supercell.replace(li_swap_idx, "Y")
+            defect_supercell.replace(y_swap_idx, "Li")
+
+            print(f"  Configuration {i}: introduced one antisite pair (Li at index {li_swap_idx} <-> Y at index {y_swap_idx}).")
+
+            # 3. Save as a numbered POSCAR file
+            poscar_filename = f"POSCAR_Pnma_{filename_suffix}_antisite_defect_{i}"
+            poscar_path = os.path.join(output_path, poscar_filename)
+            defect_supercell.to(fmt="poscar", filename=poscar_path)
+            print(f"  Saved file: {poscar_path}")
+
+
+if __name__ == '__main__':
+    # Save your Pnma-phase CIF file and adjust this path;
+    # here we use the provided reference file name 'origin.cif'
+    cif_file_path = "data/Pnma/origin.cif"
+    output_directory = "raw/Pnma/output"
+
+    create_multiple_pnma_supercells(cif_file_path, num_configs=3, output_path=output_directory)
+
+    print("\nPnma phase done!")
\ No newline at end of file
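[Editor's note, not part of the patch: an optional alternative to the retry loop used in both defect scripts is to enumerate all Li-Y pairs once and sample without replacement, which needs no attempt cap at all. A hedged standalone sketch:]

import itertools
import random

def sample_unique_swaps(li_indices, y_indices, k, seed=2025):
    # All distinct (Li, Y) index pairs, sampled without replacement;
    # min() guards against requesting more configurations than pairs exist.
    pairs = list(itertools.product(li_indices, y_indices))
    return random.Random(seed).sample(pairs, min(k, len(pairs)))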
diff --git a/dpgen/supercell_make_wangshuo.py b/dpgen/supercell_make_wangshuo.py
new file mode 100644
index 0000000..18c6a5e
--- /dev/null
+++ b/dpgen/supercell_make_wangshuo.py
@@ -0,0 +1,197 @@
+import bisect
+import random
+from collections import defaultdict, Counter
+
+from pymatgen.core import Structure, Element
+from pymatgen.io.lammps.data import LammpsData
+
+# ASE fallback (optional)
+try:
+    from ase.io import write as ase_write
+    from ase import Atoms as ASEAtoms
+    HAS_ASE = True
+except Exception:
+    HAS_ASE = False
+
+# ===== User parameters =====
+cif_filename = "data/P3ma/origin.cif"  # input CIF with partial occupancies [5]
+supercell_matrix = [[1, 0, 0], [0, 1, 0], [0, 0, 2]]  # 1x1x2 supercell (2 copies in total) [3]
+out_lammps = "lyc_P3m1_1x1x2_from_cif_ordered.vasp"  # output LAMMPS data file
+seed = 2025
+strict_count = True  # strict quota: each parent site gets an integer number of atoms/vacancies per occupancy
+# ====================
+
+random.seed(seed)
+
+def species_to_probs(site):
+    """
+    Convert a site's species/occupancies into a list of (species or None (vacancy), prob),
+    with probs normalized to sum to 1. If the total occupancy is < 1, a vacancy (None)
+    entry is appended. Oxidation states are stripped; only the element is kept.
+    """
+    sp_items = site.species.items()
+    total = 0.0
+    pairs = []
+    for spc, occ in sp_items:
+        # Convert to a plain Element (drops the oxidation state)
+        try:
+            e = Element(spc.symbol) if hasattr(spc, "symbol") else Element(str(spc))
+        except Exception:
+            e = Element(str(spc))
+        pairs.append((e, float(occ)))
+        total += float(occ)
+    if total < 1.0 - 1e-10:
+        pairs.append((None, 1.0 - total))  # vacancy
+        total = 1.0
+    # Normalize
+    if abs(total - 1.0) > 1e-10:
+        pairs = [(e, p / total) for (e, p) in pairs]
+    return pairs
+
+def draw_counts_from_probs(n, probs):
+    """
+    Given a replica count n and probabilities probs, return {species/None: count}
+    with the counts summing to n. Round first, then fix the residual so the total
+    equals n exactly.
+    """
+    # Initial allocation
+    counts = {sp: int(round(p * n)) for sp, p in probs}
+    s = sum(counts.values())
+    if s == n:
+        return counts
+
+    # Residual correction: add to high-probability species first,
+    # remove from low-probability species first
+    if n > s:
+        need = n - s
+        probs_sorted = sorted(probs, key=lambda x: x[1], reverse=True)
+        for i in range(need):
+            sp = probs_sorted[i % len(probs_sorted)][0]
+            counts[sp] = counts.get(sp, 0) + 1
+    else:
+        need = s - n
+        probs_sorted = sorted(probs, key=lambda x: x[1])  # reduce low-probability species first
+        idx = 0
+        while need > 0 and idx < 50 * len(probs_sorted):
+            sp = probs_sorted[idx % len(probs_sorted)][0]
+            if counts.get(sp, 0) > 0:
+                counts[sp] -= 1
+                need -= 1
+            idx += 1
+    return counts
+
+def collapse_disorder_to_ordered_supercell(struct, M, strict=True):
+    """
+    Steps:
+    1) Tag every site in the unit cell with a parent_id
+    2) Expand to the supercell M
+    3) Group replicas by parent site and, within each group of n replicas, assign
+       an integer number of species/vacancies per occupancy probability
+       - ordered sites: keep all replicas as-is
+       - disordered / partially occupied sites: strict quota or independent sampling
+    4) Return a fully ordered supercell Structure
+    """
+    s0 = struct.copy()
+    s0.add_site_property("parent_id", list(range(s0.num_sites)))
+
+    sc = s0.copy()
+    sc.make_supercell(M)
+
+    # Group replicas by parent site
+    groups = defaultdict(list)
+    for i, site in enumerate(sc.sites):
+        pid = sc.site_properties["parent_id"][i]
+        groups[pid].append(i)
+
+    new_species = []
+    new_fracs = []
+    new_lat = sc.lattice
+
+    for pid, idx_list in groups.items():
+        # Use the first replica in the group to define the occupancy
+        site0 = sc[idx_list[0]]
+        # Ordered site: keep all replicas (single element, occupancy 1)
+        if site0.is_ordered:
+            species_elem = list(site0.species.keys())[0]
+            for i in idx_list:
+                new_species.append(species_elem)
+                new_fracs.append(sc[i].frac_coords)
+            continue
+
+        # Disordered / partial occupancy: probabilistic assignment
+        probs = species_to_probs(site0)
+        n = len(idx_list)
+
+        if strict:
+            counts = draw_counts_from_probs(n, probs)
+            # Build the assignment pool and shuffle it
+            pool = []
+            for sp, c in counts.items():
+                pool += [sp] * c
+            random.shuffle(pool)
+            # Assign one entry to each replica site
+            for i, sp in zip(idx_list, pool):
+                if sp is None:
+                    continue  # vacancy -> drop this site
+                new_species.append(sp)
+                new_fracs.append(sc[i].frac_coords)
+        else:
+            # Independent sampling
+            species_list = [sp for sp, p in probs]
+            cum = []
+            ssum = 0.0
+            for _, p in probs:
+                ssum += p
+                cum.append(ssum)
+            for i in idx_list:
+                r = random.random()
+                j = bisect.bisect_left(cum, r)
+                sp = species_list[j]
+                if sp is None:
+                    continue
+                new_species.append(sp)
+                new_fracs.append(sc[i].frac_coords)
+
+    ordered_sc = Structure(new_lat, new_species, new_fracs, to_unit_cell=True, coords_are_cartesian=False)
+    # Strip any leftover oxidation states (LAMMPS atomic style does not need them)
+    try:
+        ordered_sc.remove_oxidation_states()
+    except Exception:
+        pass
+    return ordered_sc
+
+# 1) Read the CIF (with partial occupancies)
+s_in = Structure.from_file(cif_filename, primitive=False)
+print(f"Read: {cif_filename}, unit-cell sites: {s_in.num_sites}, ordered?: {s_in.is_ordered}")
+
+# 2) Collapse the partial occupancies on the 1x1x2 supercell -> fully ordered supercell
+ordered_sc = collapse_disorder_to_ordered_supercell(s_in, supercell_matrix, strict=strict_count)
+print(f"Ordered supercell generated: sites={ordered_sc.num_sites}, ordered?: {ordered_sc.is_ordered}")
+
+# 3) Print the element counts to verify the stoichiometry
+elem_count = Counter([sp.symbol for sp in ordered_sc.species])
+print("Element counts:", dict(elem_count))
+
+# 4) Write the LAMMPS data file (pymatgen; fall back to ASE on failure)
+wrote = False
+try:
+    ldata = LammpsData.from_structure(ordered_sc, atom_style="atomic")
+    ldata.write_file(out_lammps)
+    wrote = True
+    print(f"Wrote LAMMPS data file: {out_lammps} (pymatgen)")
+except Exception as e:
+    print("pymatgen failed to write the LAMMPS data file:", e)
+
+if not wrote and HAS_ASE:
+    try:
+        ase_atoms = ASEAtoms(
+            symbols=[sp.symbol for sp in ordered_sc.species],
+            positions=ordered_sc.cart_coords,
+            cell=ordered_sc.lattice.matrix,
+            pbc=True
+        )
+        # ASE's format name is "lammps-data" ("lammps-raw" does not exist)
+        ase_write(out_lammps, ase_atoms, format="lammps-data", atom_style="atomic")
+        wrote = True
+        print(f"Wrote LAMMPS data file: {out_lammps} (ASE)")
+    except Exception as e:
+        print("ASE also failed to write the LAMMPS data file:", e)
+
+if not wrote:
+    print("Failed to write the LAMMPS data file; see the errors above.")
\ No newline at end of file
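[Editor's note, not part of the patch: the round-and-correct quota in draw_counts_from_probs can equivalently be written as the classic largest-remainder method; a compact, self-contained sketch for comparison:]

def quota_counts(n: int, probs: list) -> dict:
    # Floor of p*n per species, then hand the remaining slots to the entries
    # with the largest fractional parts (largest-remainder method).
    base = {sp: int(p * n) for sp, p in probs}
    leftover = n - sum(base.values())
    by_frac = sorted(probs, key=lambda x: (x[1] * n) % 1.0, reverse=True)
    for sp, _ in by_frac[:leftover]:
        base[sp] += 1
    return base

print(quota_counts(8, [("Li", 0.75), (None, 0.25)]))  # {'Li': 6, None: 2}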
diff --git a/mcp/main.py b/mcp/main.py
index b882f00..5dd7f32 100644
--- a/mcp/main.py
+++ b/mcp/main.py
@@ -11,15 +11,16 @@ from starlette.routing import Mount
 from system_tools import create_system_mcp
 from materialproject_mcp import create_materials_mcp
 from softBV_remake import create_softbv_mcp
-from paper_search_mcp import create_paper_search_mcp
+# from paper_search_mcp import create_paper_search_mcp
 from topological_analysis_models import create_topological_analysis_mcp
-
+from vasp_mcp import create_vasp_mcp
 # 创建 MCP 实例
 system_mcp = create_system_mcp()
 materials_mcp = create_materials_mcp()
 softbv_mcp = create_softbv_mcp()
-paper_search_mcp = create_paper_search_mcp()
+# paper_search_mcp = create_paper_search_mcp()
 topological_analysis_mcp = create_topological_analysis_mcp()
+vasp_mcp = create_vasp_mcp()
 # 在 Starlette 的 lifespan 中启动 MCP 的 session manager
 @contextlib.asynccontextmanager
 async def lifespan(app: Starlette):
@@ -27,8 +28,9 @@ async def lifespan(app: Starlette):
         await stack.enter_async_context(system_mcp.session_manager.run())
         await stack.enter_async_context(materials_mcp.session_manager.run())
         await stack.enter_async_context(softbv_mcp.session_manager.run())
-        await stack.enter_async_context(paper_search_mcp.session_manager.run())
+        # await stack.enter_async_context(paper_search_mcp.session_manager.run())
         await stack.enter_async_context(topological_analysis_mcp.session_manager.run())
+        await stack.enter_async_context(vasp_mcp.session_manager.run())
         yield  # 服务器运行期间
 
         # 退出时自动清理
@@ -39,8 +41,9 @@ app = Starlette(
         Mount("/system", app=system_mcp.streamable_http_app()),
         Mount("/materials", app=materials_mcp.streamable_http_app()),
         Mount("/softBV", app=softbv_mcp.streamable_http_app()),
-        Mount("/papersearch",app=paper_search_mcp.streamable_http_app()),
+        # Mount("/papersearch",app=paper_search_mcp.streamable_http_app()),
         Mount("/topologicalAnalysis",app=topological_analysis_mcp.streamable_http_app()),
+        Mount("/vasp",app=vasp_mcp.streamable_http_app()),
     ],
 )
@@ -52,4 +55,5 @@
 # http://localhost:8000/softBV
 # http://localhost:8000/papersearch
 # http://localhost:8000/topologicalAnalysis
+# http://localhost:8000/vasp
 # 如果需要浏览器客户端访问(CORS 暴露 Mcp-Session-Id),请参考 README 中的 CORS 配置示例 [1]
\ No newline at end of file
diff --git a/mcp/softBV_remake.py b/mcp/softBV_remake.py
index ba2533d..b5d7c0e 100644
--- a/mcp/softBV_remake.py
+++ b/mcp/softBV_remake.py
@@ -560,7 +560,7 @@ def _parse_print_cube_output(raw_text: str) -> PrintCubeResult:
             matrix.append(parts)
         return matrix
 
-    # Find key lines and parse data
+    # Find key lines and parse the raw text
     name = re.search(r"CELL: name: (.*)", raw_text).group(1).strip()
     total_atoms = int(re.search(r"CELL: total atom: (\d+)", raw_text).group(1))
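[Editor's note, not part of the patch: every module in this server follows the same factory pattern, so for orientation here is a hypothetical minimal sub-server in the same shape as the create_vasp_mcp() added below; the name "Demo Tools" and the ping tool are illustrative only.]

from mcp.server.fastmcp import FastMCP

def create_demo_mcp() -> FastMCP:
    # Hypothetical example mirroring create_vasp_mcp() below.
    mcp = FastMCP(name="Demo Tools", streamable_http_path="/")

    @mcp.tool()
    def ping() -> str:
        """Liveness check."""
        return "pong"

    return mcp

# In main.py this would be wired up as:
#   Mount("/demo", app=create_demo_mcp().streamable_http_app())
# plus stack.enter_async_context(demo_mcp.session_manager.run()) in the lifespan.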
diff --git a/mcp/vasp_mcp.py b/mcp/vasp_mcp.py
new file mode 100644
index 0000000..1f6ab77
--- /dev/null
+++ b/mcp/vasp_mcp.py
@@ -0,0 +1,362 @@
+# vasp_mcp.py
+import os
+import posixpath
+import warnings
+from dataclasses import dataclass
+from typing import Any, AsyncIterator
+from contextlib import asynccontextmanager
+
+import asyncssh
+from pymatgen.core import Structure
+from pymatgen.io.cif import CifParser
+
+from mcp.server.fastmcp import FastMCP, Context
+from mcp.server.session import ServerSession
+
+# --- VASP-specific configuration ---
+REMOTE_HOST = os.getenv("REMOTE_HOST", '202.121.182.208')
+REMOTE_USER = os.getenv("REMOTE_USER", 'koko125')
+PRIVATE_KEY_PATH = os.getenv("PRIVATE_KEY_PATH", 'D:/tool/tool/id_rsa.txt')
+
+# VASP pseudopotential library and sandbox paths
+POTCAR_BASE_PATH = "/cluster/home/koko125/tool/potcar_mcp"
+DEFAULT_SANDBOX = f"/cluster/home/{REMOTE_USER}/sandbox"
+VASP_ENV_SCRIPT = "/cluster/home/koko125/intel/oneapi/setvars.sh"
+VASP_MPI_RUN_CMD = "mpirun -np 4 /cluster/home/koko125/vasp/bin_cpu/vasp_std"
+
+
+def shell_quote(arg: str) -> str:
+    """Safely quote a shell argument."""
+    return "'" + str(arg).replace("'", "'\"'\"'") + "'"
+
+
+# --- Shared context ---
+@dataclass
+class VaspContext:
+    ssh_connection: asyncssh.SSHClientConnection
+    potcar_base: str
+    sandbox_path: str
+    env_script: str
+    mpi_run_cmd: str
+
+
+# --- Lifespan manager ---
+@asynccontextmanager
+async def vasp_lifespan(_server: FastMCP) -> AsyncIterator[VaspContext]:
+    """Open the SSH connection and inject the VASP context."""
+    conn: asyncssh.SSHClientConnection | None = None
+    try:
+        conn = await asyncssh.connect(
+            REMOTE_HOST, username=REMOTE_USER, client_keys=[PRIVATE_KEY_PATH], known_hosts=None
+        )
+        yield VaspContext(
+            ssh_connection=conn,
+            potcar_base=POTCAR_BASE_PATH,
+            sandbox_path=DEFAULT_SANDBOX,
+            env_script=VASP_ENV_SCRIPT,
+            mpi_run_cmd=VASP_MPI_RUN_CMD
+        )
+    finally:
+        if conn:
+            conn.close()
+            await conn.wait_closed()
+
+
+# --- VASP MCP factory ---
+def create_vasp_mcp() -> FastMCP:
+    """Create the MCP instance with the VASP helper tools."""
+    mcp = FastMCP(
+        name="VASP POTCAR Tools",
+        instructions="Dedicated tools for querying and preparing VASP POTCAR files.",
+        lifespan=vasp_lifespan,
+        streamable_http_path="/",
+    )
+
+    # Reuse the safe path-joining logic from system_tools.py so targets stay inside the sandbox
+    def _safe_join_sandbox(sandbox_root: str, relative_path: str) -> str:
+        rel = (relative_path or ".").strip().lstrip("/")
+        combined = posixpath.normpath(posixpath.join(sandbox_root, rel))
+        root_norm = sandbox_root.rstrip("/")
+        if combined != root_norm and not combined.startswith(root_norm + "/"):
+            raise ValueError("Path escape: the target must stay inside the sandbox directory")
+        if ".." in combined.split("/"):
+            raise ValueError("Illegal path: '..' is not allowed")
+        return combined
+
+    # --- Tool 1: list the available pseudopotential libraries ---
+    @mcp.tool()
+    async def list_potcar_types(ctx: Context[ServerSession, VaspContext]) -> list[str]:
+        """
+        List all pseudopotential types available in the central library
+        (e.g. 'PBE_potpaw', 'PAW_GGA_PBE').
+        """
+        app_ctx = ctx.request_context.lifespan_context
+        conn = app_ctx.ssh_connection
+        try:
+            # With 'ls -F', directories carry a trailing '/'
+            result = await conn.run(f"ls -F {shell_quote(app_ctx.potcar_base)}", check=True)
+            potcar_types = [
+                name.strip('/') for name in result.stdout.strip().split() if name.endswith('/')
+            ]
+            await ctx.info(f"Available pseudopotential libraries: {potcar_types}")
+            return potcar_types
+        except Exception as e:
+            await ctx.error(f"Failed to list POTCAR types: {e}")
+            return []
+
+    # --- Tool 2 (new): query the elements available in a given library ---
+    @mcp.tool()
+    async def query_potcar_elements(ctx: Context[ServerSession, VaspContext], potcar_type: str) -> list[str]:
+        """
+        Query which elements have pseudopotentials in the given library.
+        """
+        app_ctx = ctx.request_context.lifespan_context
+        conn = app_ctx.ssh_connection
+
+        # Build the source directory path safely
+        source_dir = posixpath.join(app_ctx.potcar_base, potcar_type)
+        if ".." in potcar_type or "/" in potcar_type:
+            await ctx.error(f"Illegal pseudopotential library type: {potcar_type}")
+            return []
+
+        await ctx.info(f"Querying directory '{source_dir}' for available elements...")
+        try:
+            result = await conn.run(f"ls -F {shell_quote(source_dir)}", check=True)
+            elements = [
+                name.strip('/') for name in result.stdout.strip().split() if name.endswith('/')
+            ]
+            return elements
+        except asyncssh.ProcessError as e:
+            msg = f"Element query failed: library '{potcar_type}' may not exist. Stderr: {e.stderr}"
+            await ctx.error(msg)
+            return []
+        except Exception as e:
+            await ctx.error(f"Unexpected error while querying POTCAR elements: {e}")
+            return []
+
+    # --- Tool 3 (new): safely copy a POTCAR file from the central library into the sandbox ---
+    @mcp.tool()
+    async def copy_potcar_file(
+        ctx: Context[ServerSession, VaspContext],
+        potcar_type: str,
+        element: str,
+        destination_path: str
+    ) -> dict[str, str]:
+        """
+        Safely copy one element's POTCAR from the central library to a target path
+        inside the user sandbox, e.g. copy the 'Si' potential of the 'PBE_potpaw'
+        library to 'sio2_relax/POTCAR_Si'.
+        """
+        app_ctx = ctx.request_context.lifespan_context
+        conn = app_ctx.ssh_connection
+
+        try:
+            # 1. Build the source path safely (only subdirectories of potcar_base are reachable)
+            if ".." in potcar_type or "/" in potcar_type or ".." in element or "/" in element:
+                raise ValueError("Illegal pseudopotential library type or element name.")
+            source_file = posixpath.join(app_ctx.potcar_base, potcar_type, element, "POTCAR")
+
+            # 2. Build the destination path safely (must stay inside the sandbox)
+            dest_file_abs = _safe_join_sandbox(app_ctx.sandbox_path, destination_path)
+
+            # 3. Run the cp command
+            cmd = f"cp {shell_quote(source_file)} {shell_quote(dest_file_abs)}"
+            await ctx.info(f"Safe copy: cp {source_file} -> {dest_file_abs}")
+            await conn.run(cmd, check=True)
+
+            return {"status": "success", "source": source_file, "destination": destination_path}
+
+        except asyncssh.ProcessError as e:
+            msg = f"POTCAR copy failed. Check potcar_type and element. Stderr: {e.stderr}"
+            await ctx.error(msg)
+            return {"status": "error", "message": msg}
+        except ValueError as e:
+            await ctx.error(f"Path validation failed: {e}")
+            return {"status": "error", "message": str(e)}
+        except Exception as e:
+            await ctx.error(f"Unexpected error while copying POTCAR: {e}")
+            return {"status": "error", "message": str(e)}
+
+    # cif_to_poscar is kept because it is useful for determining the element order.
+    # Only one tool may carry this name, so only the robust variant is registered.
+    @mcp.tool()
+    async def cif_to_poscar(
+        cif_content: str,
+        ctx: Context[ServerSession, VaspContext],
+        sort_structure: bool = True
+    ) -> dict[str, str]:
+        """
+        Robustly convert CIF file content into VASP POSCAR content.
+        Several parsing strategies are tried so that loosely formatted CIF files still work.
+        On success, returns the POSCAR content plus the ordered element list needed for
+        the POTCAR; on failure, returns a detailed error message.
+
+        Args:
+            cif_content (str): full content of the CIF file.
+            sort_structure (bool): sort the atoms to match pymatgen's POTCAR conventions. Defaults to True.
+        """
+        structure = None
+        last_exception = None
+
+        # Silence pymatgen warnings so they do not pollute the output
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+
+            # --- Strategy 1: standard parsing ---
+            try:
+                await ctx.debug("Trying strategy 1: standard CIF parsing...")
+                structure = Structure.from_str(cif_content, fmt="cif")
+                await ctx.info("Strategy 1 succeeded: standard parsing done.")
+            except Exception as e:
+                await ctx.warning(f"Strategy 1 failed: {e}")
+                last_exception = e
+
+            # --- Strategy 2: lenient parsing (relaxed occupancy tolerance) ---
+            if structure is None:
+                try:
+                    await ctx.debug("Trying strategy 2: lenient parsing (relaxed occupancy tolerance)...")
+                    parser = CifParser.from_str(cif_content, occupancy_tolerance=2.0)
+                    structure = parser.parse_structures(primitive=True)[0]
+                    await ctx.info("Strategy 2 succeeded: lenient parsing done.")
+                except Exception as e:
+                    await ctx.warning(f"Strategy 2 failed: {e}")
+                    last_exception = e
+
+            # --- Strategy 3: raw coordinates, no symmetry reduction ---
+            if structure is None:
+                try:
+                    await ctx.debug("Trying strategy 3: raw coordinates (primitive=False)...")
+                    parser = CifParser.from_str(cif_content)
+                    # Take the structure as given in the file instead of the computed primitive cell
+                    structure = parser.parse_structures(primitive=False)[0]
+                    await ctx.info("Strategy 3 succeeded: raw coordinates used.")
+                except Exception as e:
+                    await ctx.warning(f"Strategy 3 failed: {e}")
+                    last_exception = e
+
+        # --- All strategies failed ---
+        if structure is None:
+            error_message = (
+                "Could not parse a crystal structure from the given CIF content; all strategies failed.\n"
+                f"Last error: {last_exception}\n"
+                "Suggestion: check whether the CIF is badly corrupted. The AI can regenerate the CIF, "
+                "or ask the user for POSCAR content directly."
+            )
+            await ctx.error(error_message)
+            return {"status": "error", "message": error_message}
+
+        # --- Post-processing on success ---
+        try:
+            # Sort the structure so the element order matches pymatgen's POTCAR logic
+            if sort_structure:
+                structure = structure.get_sorted_structure()
+
+            # Ordered element list of the structure
+            elements = [el.symbol for el in structure.composition.elements]
+
+            # Generate the POSCAR content (the element line is always written)
+            poscar_content = structure.to(fmt="poscar")
+
+            return {
+                "status": "success",
+                "poscar_content": poscar_content,
+                "elements": " ".join(elements)  # ordered elements, space separated
+            }
+        except Exception as e:
+            # Rare, but post-processing can still fail
+            final_error = f"Structure parsed, but POSCAR generation failed: {e}"
+            await ctx.error(final_error)
+            return {"status": "error", "message": final_error}
+
+    @mcp.tool()
+    async def test_vasp_run(
+        ctx: Context[ServerSession, VaspContext],
+        job_directory: str
+    ) -> dict[str, Any]:
+        """
+        Launch VASP in "lite mode" (--lite) in the given directory for a quick test.
+        This mode fully parses all input files (INCAR, POSCAR, ...) and validates the
+        parameters, but starts no actual ionic or electronic steps, so it usually
+        finishes within seconds.
+
+        Args:
+            job_directory (str): remote sandbox subdirectory holding all VASP input files.
+        """
+        app_ctx = ctx.request_context.lifespan_context
+        conn = app_ctx.ssh_connection
+
+        try:
+            # Build the absolute working directory safely
+            workdir_abs = _safe_join_sandbox(app_ctx.sandbox_path, job_directory)
+
+            await ctx.info(f"Validating VASP inputs in '{workdir_abs}' (lite mode)...")
+
+            # Key point: append the --lite flag to the VASP command.
+            # mpi_run_cmd is e.g. "mpirun -np 4 .../vasp_std", so --lite lands after vasp_std.
+            command_with_lite = f"{app_ctx.mpi_run_cmd} --lite"
+
+            # Full shell command: activate the environment, then run
+            inner_command = (
+                f"cd {shell_quote(workdir_abs)}; "
+                f"source {shell_quote(app_ctx.env_script)}; "
+                f"{command_with_lite}"
+            )
+            full_shell_command = f"bash -lc {shell_quote(inner_command)}"
+
+            # Wait for completion with a plain conn.run; check=False because we
+            # want to handle non-zero exits ourselves
+            proc = await conn.run(full_shell_command, check=False)
+
+            # Analyze the result
+            stdout = proc.stdout or ""
+            stderr = proc.stderr or ""
+
+            # VASP usually prints this line when it initializes and exits normally
+            success_indicator = "General timing and accounting informations for this job"
+
+            if proc.exit_status == 0 and success_indicator in stdout:
+                test_passed = True
+                conclusion = "Test passed: the VASP inputs are valid and all parameters were parsed."
+                await ctx.info(conclusion)
+            else:
+                test_passed = False
+                conclusion = "Test failed: VASP reported an error or did not finish initialization. See stderr below."
+                await ctx.warning(conclusion)
+
+            return {
+                "status": "completed",
+                "test_passed": test_passed,
+                "conclusion": conclusion,
+                "exit_status": proc.exit_status,
+                "stdout_preview": "\n".join(stdout.splitlines()[-20:]),  # last 20 lines only, to avoid flooding
+                "stderr": stderr
+            }
+
+        except Exception as e:
+            msg = f"Unexpected error while running test_vasp_run: {e}"
+            await ctx.error(msg)
+            return {"status": "error", "message": str(e)}
+
+    # _safe_join_sandbox must stay defined inside create_vasp_mcp (or otherwise be reachable)
+
+    return mcp
\ No newline at end of file
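[Editor's note, not part of the patch: the sandbox check in _safe_join_sandbox is the security-critical piece of this module, so here is the same normalize-then-prefix-test logic as a standalone snippet that can be exercised directly; the paths are illustrative.]

import posixpath

def safe_join(root: str, rel: str) -> str:
    # Normalize, then require the result to stay under the sandbox root.
    combined = posixpath.normpath(posixpath.join(root, (rel or ".").strip().lstrip("/")))
    root_norm = root.rstrip("/")
    if combined != root_norm and not combined.startswith(root_norm + "/"):
        raise ValueError("path escapes the sandbox")
    return combined

assert safe_join("/cluster/home/u/sandbox", "job1/POTCAR") == "/cluster/home/u/sandbox/job1/POTCAR"
# safe_join("/cluster/home/u/sandbox", "../../etc/passwd") raises ValueError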
diff --git a/rss/nature_filter_rss.py b/rss/nature_filter_rss.py
new file mode 100644
index 0000000..3956578
--- /dev/null
+++ b/rss/nature_filter_rss.py
@@ -0,0 +1,122 @@
+import feedparser
+import requests
+from feedgen.feed import FeedGenerator
+from datetime import datetime, timezone
+import calendar
+import time
+
+# --- 1. Configuration ---
+
+# Keyword list; matching is case-insensitive
+KEYWORDS = ['solid-state battery', 'lithium metal', 'anode-free', 'electrolyte']
+
+# RSS feeds of the Nature-family journals to monitor
+SOURCE_FEEDS = {
+    'Nature': 'https://www.nature.com/nature/rss/current',
+    'Nat Commun': 'https://www.nature.com/ncomms/rss/current',
+    'Nat Energy': 'https://www.nature.com/nenergy/rss/current',
+    'Nat Mater': 'https://www.nature.com/nmat/rss/current',
+    'Nat Nanotechnol': 'https://www.nature.com/nnano/rss/current',
+    'Nat Sustain': 'https://www.nature.com/natsustain/rss/current',
+    'Nat Chem': 'https://www.nature.com/nchem/rss/current',
+    'Nat Synth': 'https://www.nature.com/natsynth/rss/current',
+    'Nat Catal': 'https://www.nature.com/natcatal/rss/current',
+    'Nat Rev Mater': 'https://www.nature.com/natrevmat/rss/current',
+    'Nat Rev Chem': 'https://www.nature.com/natrevchem/rss/current',
+    'Nat Rev Earth Environ': 'https://www.nature.com/natrevearthenviron/rss/current',
+}
+
+# Path of the generated RSS file; make sure ttrss can reach it through the web server
+OUTPUT_FILE = '/var/www/html/rss/nature_filtered_feed.xml'
+
+
+# --- 2. Core logic ---
+
+def fetch_and_filter():
+    """Fetch all feeds, filter the articles, and return the list of matches."""
+
+    print(f"Starting feed fetch at {datetime.now()}")
+
+    matched_articles = []
+    # Track links of already-added articles to avoid duplicates
+    seen_links = set()
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    for name, url in SOURCE_FEEDS.items():
+        print(f"  -> Fetching from {name}...")
+        try:
+            # Fetch with requests: better network-error handling, plus a browser User-Agent
+            response = requests.get(url, headers=headers, timeout=15)
+            response.raise_for_status()  # make sure the request succeeded
+
+            # Parse the fetched content with feedparser
+            feed = feedparser.parse(response.content)
+
+            for entry in feed.entries:
+                # Skip articles that were already processed
+                if entry.link in seen_links:
+                    continue
+
+                # Concatenate title and summary for the search
+                content_to_check = (entry.title + ' ' + entry.get('summary', '')).lower()
+
+                # Check whether any keyword occurs in the content
+                if any(keyword.lower() in content_to_check for keyword in KEYWORDS):
+                    print(f"    [MATCH FOUND] in {name}: {entry.title}")
+
+                    # Prefix the source journal so it shows up nicely in the RSS reader
+                    entry.title = f"[{name}] {entry.title}"
+                    matched_articles.append(entry)
+                    seen_links.add(entry.link)
+
+            # Be polite; avoid hammering the servers
+            time.sleep(1)
+
+        except requests.RequestException as e:
+            print(f"    [ERROR] Could not fetch {name}: {e}")
+        except Exception as e:
+            print(f"    [ERROR] An unexpected error occurred for {name}: {e}")
+
+    print(f"\nFound {len(matched_articles)} matching articles in total.")
+    return matched_articles
+
+
+def generate_filtered_feed(articles):
+    """Generate the new RSS file from the filtered article list."""
+
+    fg = FeedGenerator()
+    fg.title('My Filtered Nature Research Feed')
+    fg.link(href='https://www.nature.com', rel='alternate')
+    fg.description(f"Custom RSS feed for Nature journals, filtered by keywords: {', '.join(KEYWORDS)}")
+
+    # Sort by publication date, newest first; fall back to the epoch if neither date exists
+    articles.sort(key=lambda x: x.get('published_parsed') or x.get('updated_parsed') or time.gmtime(0), reverse=True)
+
+    for entry in articles:
+        fe = fg.add_entry()
+        fe.id(entry.link)  # use the article link as the unique ID
+        fe.title(entry.title)
+        fe.link(href=entry.link)
+        # feedparser has already extracted the summary for us
+        fe.description(entry.get('summary', 'No summary available.'))
+
+        # Handle the publication date
+        pub_date = entry.get('published_parsed')
+        if pub_date:
+            # published_parsed is a UTC struct_time, so convert with calendar.timegm
+            # (time.mktime would wrongly interpret it as local time)
+            fe.published(datetime.fromtimestamp(calendar.timegm(pub_date), tz=timezone.utc))
+
+    # Write the file
+    fg.rss_file(OUTPUT_FILE, pretty=True)
+    print(f"Successfully generated new RSS feed at {OUTPUT_FILE}")
+
+
+# --- 3. Entry point ---
+if __name__ == "__main__":
+    filtered_articles = fetch_and_filter()
+    if filtered_articles:
+        generate_filtered_feed(filtered_articles)
+    else:
+        print("No new matching articles found. RSS file not updated.")
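[Editor's note, not part of the patch: a quick, dependency-free check that the keyword matching behaves as intended; it uses the same lowercase-substring logic as fetch_and_filter above.]

KEYWORDS = ['solid-state battery', 'lithium metal', 'anode-free', 'electrolyte']

def matches(title: str, summary: str = "") -> bool:
    # Mirrors the filter above: case-insensitive substring match on title + summary.
    text = (title + " " + summary).lower()
    return any(keyword.lower() in text for keyword in KEYWORDS)

assert matches("A Polymer ELECTROLYTE for Anode-Free Cells")
assert not matches("Perovskite photovoltaics update")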