对比学习法增改

2025-10-29 11:39:30 +08:00
parent 1f8667ae51
commit 95d719cc1e
5 changed files with 466 additions and 0 deletions
--- a/learning/copy.py
+++ b/learning/copy.py
@@ -0,0 +1,151 @@
+import shutil
+from pathlib import Path
+
+
+def find_element_column_index(cif_lines: list) -> int:
+    """
+    Locate the column of ``_atom_site_type_symbol`` inside a CIF ``loop_`` block.
+
+    Every ``loop_`` header is inspected in file order. The first loop whose
+    header declares ``_atom_site_type_symbol`` and is followed by at least one
+    non-tag line wins. Loops that do not declare the tag (for example a leading
+    symmetry-operator loop) are skipped instead of ending the search.
+
+    :param cif_lines: Lines read from a CIF file.
+    :return: Zero-based index of the element-symbol column within its loop,
+        or -1 if no loop declares the tag.
+    """
+    target_tag = '_atom_site_type_symbol'
+    in_loop_header = False
+    column_index = -1
+    current_column = 0
+
+    for line in cif_lines:
+        line_stripped = line.strip()
+        if not line_stripped:
+            continue
+
+        if line_stripped.startswith('loop_'):
+            # A new loop begins: forget whatever the previous header held.
+            in_loop_header = True
+            column_index = -1
+            current_column = 0
+            continue
+
+        if not in_loop_header:
+            # Data rows and key-value tags outside a loop header are irrelevant.
+            continue
+
+        if line_stripped.startswith('_'):
+            # Compare the whole tag, case-insensitively, so that a longer tag
+            # sharing the same prefix is not mistaken for the symbol column.
+            if line_stripped.split()[0].lower() == target_tag:
+                column_index = current_column
+            current_column += 1
+        elif column_index != -1:
+            # First non-tag line after a header that declared the tag.
+            return column_index
+        else:
+            # This loop has no symbol column; keep looking at later loops.
+            in_loop_header = False
+
+    return -1  # no loop_ header declared _atom_site_type_symbol
+
+
+def _is_O_or_S_symbol(token: str) -> bool:
+    """
+    Tell whether an atom-type token names oxygen or sulfur.
+
+    Surrounding quotes and a trailing oxidation-state suffix are removed first,
+    so ``O``, ``O2-`` and ``S6+`` match while ``Os``, ``Sc`` or ``Si`` do not.
+
+    :param token: One whitespace-separated field of a CIF data row.
+    :return: True if the bare element symbol is exactly ``O`` or ``S``.
+    """
+    return token.strip('\'"').rstrip('0123456789+-') in ('O', 'S')
+
+
+def _rows_have_O_or_S(cif_lines: list, element_col_idx: int) -> bool:
+    """
+    Scan the non-tag lines of a CIF for O or S in the given column.
+
+    Every line that is not blank, a comment, a tag or a ``loop_`` marker is
+    treated as a data row, so rows of unrelated loops are inspected as well.
+
+    :param cif_lines: Lines read from a CIF file.
+    :param element_col_idx: Non-negative index of the element-symbol column.
+    :return: True as soon as one row carries O or S in that column.
+    """
+    for line in cif_lines:
+        stripped = line.strip()
+        if not stripped or stripped.startswith(('#', '_', 'loop_')):
+            continue
+        fields = stripped.split()
+        if len(fields) > element_col_idx and _is_O_or_S_symbol(fields[element_col_idx]):
+            return True
+    return False
+
+
+def _formula_has_O_or_S(cif_lines: list) -> bool:
+    """
+    Check the single-line chemical-formula tags for the elements O or S.
+
+    The formula value is split into element symbols (one capital letter plus an
+    optional lowercase letter), so ``Sc``, ``Si`` or ``Os`` are not mistaken
+    for sulfur or oxygen.
+
+    :param cif_lines: Lines read from a CIF file.
+    :return: True if ``_chemical_formula_sum`` or
+        ``_chemical_formula_structural`` lists O or S.
+    """
+    import re  # local import: this block cannot touch the file-level imports
+
+    formula_tags = ('_chemical_formula_sum', '_chemical_formula_structural')
+    for line in cif_lines:
+        fields = line.strip().split(None, 1)
+        if len(fields) == 2 and fields[0] in formula_tags:
+            if {'O', 'S'} & set(re.findall(r'[A-Z][a-z]?', fields[1])):
+                return True
+    return False
+
+
+def copy_cif_with_O_or_S_robust(source_dir: str, target_dir: str, dry_run: bool = False):
+    """
+    Copy every CIF file that contains oxygen or sulfur into another folder.
+
+    Only ``*.cif`` files directly inside ``source_dir`` are examined
+    (sub-folders are not searched). A file matches when the column reported by
+    ``find_element_column_index`` holds ``O`` or ``S`` in some data row. If the
+    file has no such column, its single-line chemical-formula tags are checked
+    instead. Progress and a final summary are printed to stdout.
+
+    :param source_dir: Folder holding the CIF files to scan.
+    :param target_dir: Folder receiving the matching files; it is created when
+        missing, unless ``dry_run`` is set.
+    :param dry_run: When True, only report what would be copied.
+    """
+    # 1. Resolve and validate the two folders.
+    source_path = Path(source_dir)
+    target_path = Path(target_dir)
+
+    if not source_path.is_dir():
+        print(f"错误：源文件夹 '{source_dir}' 不存在或不是一个文件夹。")
+        return
+
+    if not dry_run and not target_path.exists():
+        target_path.mkdir(parents=True, exist_ok=True)
+        print(f"目标文件夹 '{target_dir}' 已创建。")
+
+    print(f"源文件夹: {source_path}")
+    print(f"目标文件夹: {target_path}")
+    if dry_run:
+        print("\n--- *** 模拟运行模式 (Dry Run) *** ---")
+        print("--- 不会执行任何实际的文件复制操作 ---")
+
+    # 2. Walk the folder and filter.
+    print("\n开始扫描源文件夹中的CIF文件...")
+    copied_count = 0
+    checked_files = 0
+    error_files = 0
+
+    # glob() stays in the top-level folder; rglob('*.cif') would also descend
+    # into sub-folders.
+    for file_path in source_path.glob('*.cif'):
+        if not file_path.is_file():
+            continue
+        checked_files += 1
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                lines = f.readlines()
+
+            # Prefer the atom-site column; fall back to the formula tags only
+            # when the file exposes no such column.
+            element_col_idx = find_element_column_index(lines)
+            if element_col_idx >= 0:
+                found = _rows_have_O_or_S(lines, element_col_idx)
+            else:
+                found = _formula_has_O_or_S(lines)
+            if not found:
+                continue
+
+            print(f"找到匹配: '{file_path.name}' (含有 O 或 S 元素)")
+            if not dry_run:
+                shutil.copy2(file_path, target_path / file_path.name)
+            copied_count += 1
+        except OSError as e:
+            # Read or copy failure: report it and carry on with the next file.
+            error_files += 1
+            print(f"!! 处理文件 '{file_path.name}' 时发生错误: {e}")
+
+    # 3. Final report.
+    print("\n--- 操作总结 ---")
+    print(f"共检查了 {checked_files} 个.cif文件。")
+    if error_files > 0:
+        print(f"处理过程中有 {error_files} 个文件发生错误。")
+    if dry_run:
+        print(f"模拟运行结束：如果实际运行，将会有 {copied_count} 个文件被复制。")
+    else:
+        print(f"成功复制了 {copied_count} 个文件到目标文件夹。")
+if __name__ == '__main__':
+    # NOTE: replace both paths with real locations on your own machine.
+    cif_source = "D:/download/2025-10/data_all/input/input"
+    cif_target = "D:/download/2025-10/data_all/output"
+
+    # Pass 1: dry run, which only lists the files that would be copied.
+    print("================ 第一次运行: 模拟模式 ================")
+    copy_cif_with_O_or_S_robust(source_dir=cif_source, target_dir=cif_target, dry_run=True)
+
+    # Give the user a chance to inspect the dry-run listing before copying.
+    print("\n\n=======================================================")
+    input("检查上面的模拟运行结果。如果符合预期，按回车键继续执行实际复制操作...")
+    print("=======================================================")
+
+    # Pass 2: perform the real copy (dry_run defaults to False).
+    print("\n================ 第二次运行: 实际复制模式 ================")
+    copy_cif_with_O_or_S_robust(source_dir=cif_source, target_dir=cif_target)
--- a/learning/delete.py
+++ b/learning/delete.py
@@ -0,0 +1,111 @@
+import shutil
+from pathlib import Path
+
+
+def delete_duplicates_from_second_folder(source_dir: str, target_dir: str, dry_run: bool = False):
+    """
+    Delete from the second folder every entry whose name also exists in the first.
+
+    Only names of the direct children are compared, never contents. A matching
+    file or symbolic link is unlinked, and a matching directory is removed with
+    everything inside it. The call is refused when both arguments resolve to
+    the same folder, because that would wipe the folder. Progress and a
+    summary are printed to stdout.
+
+    :param source_dir: First folder; supplies the names to look for.
+    :param target_dir: Second folder; matching entries are deleted from here.
+    :param dry_run: When True, only report what would be deleted.
+    """
+    # 1. Work with Path objects throughout.
+    source_path = Path(source_dir)
+    target_path = Path(target_dir)
+
+    # 2. Both arguments must be existing, distinct folders.
+    if not source_path.is_dir():
+        print(f"错误：源文件夹 '{source_dir}' 不存在或不是一个文件夹。")
+        return
+    if not target_path.is_dir():
+        print(f"错误：目标文件夹 '{target_dir}' 不存在或不是一个文件夹。")
+        return
+    if source_path.resolve() == target_path.resolve():
+        # Every entry would match itself and the whole folder would be emptied.
+        print("错误：源文件夹和目标文件夹是同一个文件夹，已中止以避免删除其中的全部内容。")
+        return
+
+    print(f"源文件夹: {source_path}")
+    print(f"目标文件夹: {target_path}")
+    if dry_run:
+        print("\n--- *** 模拟运行模式 (Dry Run) *** ---")
+        print("--- 不会执行任何实际的删除操作 ---")
+
+    # 3. Collect the names of all direct children of the source folder.
+    #    Sorting keeps the printed log in a stable order between runs.
+    source_item_names = sorted(p.name for p in source_path.iterdir())
+
+    if not source_item_names:
+        print("\n源文件夹为空，无需执行任何操作。")
+        return
+
+    print(f"\n在源文件夹中找到 {len(source_item_names)} 个项目。")
+    print("开始检查并删除目标文件夹中的同名项目...")
+
+    deleted_count = 0
+    # 4. Probe the target folder for each name.
+    for item_name in source_item_names:
+        item_to_delete = target_path / item_name
+
+        # exists() follows symlinks, so a dangling link must be detected
+        # separately.
+        is_link = item_to_delete.is_symlink()
+        if not (is_link or item_to_delete.exists()):
+            continue
+
+        try:
+            if is_link or item_to_delete.is_file():
+                # Links are unlinked themselves; shutil.rmtree refuses them and
+                # their targets must stay untouched.
+                print(f"准备删除文件: {item_to_delete}")
+                if not dry_run:
+                    item_to_delete.unlink()
+                    print("  -> 已删除。")
+            elif item_to_delete.is_dir():
+                print(f"准备删除文件夹及其所有内容: {item_to_delete}")
+                if not dry_run:
+                    shutil.rmtree(item_to_delete)
+                    print("  -> 已删除。")
+            else:
+                # Neither a regular file nor a directory: leave it alone.
+                continue
+            deleted_count += 1
+        except OSError as e:
+            print(f"!! 删除 '{item_to_delete}' 时发生错误: {e}")
+
+    if deleted_count == 0:
+        print("\n操作完成：在目标文件夹中没有找到需要删除的同名项目。")
+    elif dry_run:
+        print(f"\n模拟运行结束：如果实际运行，将会有 {deleted_count} 个项目被删除。")
+    else:
+        print(f"\n操作完成：总共删除了 {deleted_count} 个项目。")
+
+
+# --- 使用示例 ---
+
+# 在运行前，请创建以下文件夹和文件结构进行测试：
+# /your/path/folder1/
+#   ├── file_a.txt
+#   ├── file_b.log
+#   └── subfolder_x/
+#       └── test.txt
+
+# /your/path/folder2/
+#   ├── file_a.txt      (将被删除)
+#   ├── file_c.md
+#   └── subfolder_x/    (将被删除)
+#       └── another.txt
+
+if __name__ == '__main__':
+    # NOTE: replace both paths with real locations on your own machine.
+    names_from = "D:/download/2025-10/after_step5/after_step5/S"  # folder supplying the names
+    delete_in = "D:/download/2025-10/input/input"  # folder that loses the matching entries
+
+    # Pass 1: dry run. It reports what would be deleted without removing
+    # anything, and is strongly recommended before the real pass.
+    print("================ 第一次运行: 模拟模式 ================")
+    delete_duplicates_from_second_folder(source_dir=names_from, target_dir=delete_in, dry_run=True)
+
+    # Wait for explicit confirmation before anything is removed.
+    print("\n\n=======================================================")
+    input("检查上面的模拟运行结果。如果符合预期，按回车键继续执行实际删除操作...")
+    print("=======================================================")
+
+    # Pass 2: the real deletion (dry_run defaults to False).
+    print("\n================ 第二次运行: 实际删除模式 ================")
+    delete_duplicates_from_second_folder(source_dir=names_from, target_dir=delete_in)
--- a/dpgen/data/Pnma/origin_backup.cif
+++ b/dpgen/data/Pnma/origin_backup.cif
@@ -0,0 +1,53 @@
+#------------------------------------------------------------------------------
+# CIF (Crystallographic Information File) for Li3YCl6
+# Data source: Table S1 from the provided image.
+# Rietveld refinement result of the neutron diffraction pattern for the 450 °C-annealed sample.
+#------------------------------------------------------------------------------
+
+data_Li3YCl6
+
+_chemical_name_systematic      'Lithium Yttrium Chloride'
+_chemical_formula_sum          'Li3 Y1 Cl6'
+_chemical_formula_structural   'Li3YCl6'
+
+_symmetry_space_group_name_H-M   'P n m a'
+_symmetry_Int_Tables_number      62
+_symmetry_cell_setting           orthorhombic
+
+loop_
+_symmetry_equiv_pos_as_xyz
+'x, y, z'
+'-x+1/2, y+1/2, -z+1/2'
+'-x, -y, -z'
+'x+1/2, -y+1/2, z+1/2'
+'-x, y+1/2, -z'
+'x-1/2, -y-1/2, z-1/2'
+'x, -y, z'
+'-x-1/2, y-1/2, -z-1/2'
+
+_cell_length_a                   12.92765(13)
+_cell_length_b                   11.19444(10)
+_cell_length_c                   6.04000(12)
+_cell_angle_alpha                90.0
+_cell_angle_beta                 90.0
+_cell_angle_gamma                90.0
+_cell_volume                     874.15
+_cell_formula_units_Z            4
+
+loop_
+_atom_site_label
+_atom_site_type_symbol
+_atom_site_fract_x
+_atom_site_fract_y
+_atom_site_fract_z
+_atom_site_occupancy
+_atom_site_Wyckoff_symbol
+_atom_site_U_iso_or_equiv
+Li1   Li   0.11730(7)   0.09640(7)   0.04860(10)   0.750(13)   8d   4.579(2)
+Li2   Li   0.13270(9)   0.07900(10)  0.48600(2)    0.750(19)   8d   9.554(4)
+Cl1   Cl   0.21726(7)   0.58920(7)   0.26362(11)   1.0         8d   0.797(17)
+Cl2   Cl   0.45948(8)   0.08259(8)   0.23831(13)   1.0         8d   1.548(2)
+Cl3   Cl   0.04505(10)  0.25000      0.74110(2)    1.0         4c   1.848(3)
+Cl4   Cl   0.20205(9)   0.25000      0.24970(2)    1.0         4c   0.561(2)
+Y1    Y    0.37529(10)  0.25000      0.01870(3)    1.0         4c   1.121(17)
+#------------------------------------------------------------------------------
--- a/dpgen/plus.py
+++ b/dpgen/plus.py
@@ -0,0 +1,151 @@
+import random
+from typing import List
+from pymatgen.core import Structure
+from pymatgen.io.vasp import Poscar
+
+def _is_close_frac(z, target, tol=2e-2):
+    """
+    Compare a fractional coordinate with a target value across the cell boundary.
+
+    ``target`` is first wrapped into [0, 1); ``z`` is then measured against that
+    value and against its two neighbours one period above and below.
+
+    :param z: Fractional coordinate to test.
+    :param target: Reference fractional coordinate; may lie outside [0, 1).
+    :param tol: Largest distance still counted as close (strict comparison).
+    :return: True if the smallest of the three distances is below ``tol``.
+    """
+    wrapped = target % 1.0
+    periodic_images = (wrapped, wrapped + 1, wrapped - 1)
+    return min(abs(z - image) for image in periodic_images) < tol
+
+def make_model3_poscar_from_cif(cif_path: str,
+                                out_poscar: str = "POSCAR_model3_supercell",
+                                seed: int = 42,
+                                tol: float = 2e-2):
+    """
+    Expand a partially occupied CIF into an ordered supercell and write a POSCAR.
+
+    The structure is expanded with the matrix [[3,0,0],[2,4,0],[0,0,6]].
+    Three families of sites are then recognised by their z coordinate in the
+    ORIGINAL cell and thinned out at random: Y near z=0.488 (75 % kept),
+    Y near z=-0.065 / 0.935 (25 % kept) and Li near z=0.5 (50 % kept).
+    Kept sites get full occupancy and the others are removed. Any site that is
+    still disordered afterwards is replaced by its majority species. The result
+    is sorted Li, Y, Cl and written with ``Poscar``.
+    (The original author's note states the supercell holds 2160 atoms and
+    names the three families Y2, Y3 and Li2; neither is verified here.)
+
+    :param cif_path: Path of the input CIF file.
+    :param out_poscar: Path of the POSCAR file to write.
+    :param seed: Seed of the private random generator that picks kept sites.
+    :param tol: Tolerance on the fractional z coordinate used for matching.
+    """
+    # A private generator yields the same picks as random.seed(seed) would,
+    # without resetting the interpreter-wide random state.
+    rng = random.Random(seed)
+
+    # 1) Read the CIF.
+    s = Structure.from_file(cif_path)
+
+    # 2) Build the supercell: a_s = 3a0, b_s = 2a0 + 4b0, c_s = 6c0.
+    T = [[3, 0, 0],
+         [2, 4, 0],
+         [0, 0, 6]]
+    s.make_supercell(T)
+    # Only the third row of T has a c component, so a supercell coordinate
+    # z_s maps back to the original cell as (z_s * c_repeat) mod 1.
+    c_repeat = T[2][2]
+
+    # 3) Classify the sites that need explicit ordering.
+    y2_idx: List[int] = []
+    y3_idx: List[int] = []
+    li2_idx: List[int] = []
+
+    for i, site in enumerate(s.sites):
+        # Fallback for pymatgen versions exposing the species differently.
+        try:
+            el = site.species.elements[0].symbol
+        except Exception:
+            ss = site.species_string
+            el = "Li" if ss.startswith("Li") else ("Y" if ss.startswith("Y") else ("Cl" if ss.startswith("Cl") else ss))
+        # The z targets below are original-cell values, so fold the supercell
+        # coordinate back before comparing.
+        z = (site.frac_coords[2] * c_repeat) % 1.0
+        if el == "Y":
+            if _is_close_frac(z, 0.488, tol):
+                y2_idx.append(i)
+            elif _is_close_frac(z, -0.065, tol) or _is_close_frac(z, 0.935, tol):
+                y3_idx.append(i)
+        elif el == "Li":
+            if _is_close_frac(z, 0.5, tol):
+                li2_idx.append(i)
+
+    def choose_keep(idxs, frac_keep):
+        """Randomly split ``idxs`` into kept (about ``frac_keep``) and dropped."""
+        n = len(idxs)
+        k = min(max(int(round(n * frac_keep)), 0), n)
+        # Sampling is skipped for k == 0 and k == n so that the random stream
+        # is only consumed when a real choice is made.
+        keep = set(rng.sample(idxs, k)) if 0 < k < n else set(idxs if k == n else [])
+        drop = [i for i in idxs if i not in keep]
+        return keep, drop
+
+    keep_y2, drop_y2 = choose_keep(y2_idx, 0.75)
+    keep_y3, drop_y3 = choose_keep(y3_idx, 0.25)
+    keep_li2, drop_li2 = choose_keep(li2_idx, 0.50)
+
+    # 4) Kept sites become fully occupied; the rest are removed in one call.
+    #    Replacing does not shift indices, so it is done before the removal.
+    for i in keep_y2 | keep_y3:
+        s.replace(i, "Y")
+    for i in keep_li2:
+        s.replace(i, "Li")
+    s.remove_sites(drop_y2 + drop_y3 + drop_li2)
+
+    # 5) Safety net: any site that is still disordered is replaced by its
+    #    majority species, since POSCAR output needs an ordered structure.
+    for i, site in enumerate(s.sites):
+        if not site.is_ordered:
+            d = site.species.as_dict()  # e.g. {'Li': 0.5} or {'Li': 0.5, 'Y': 0.5}
+            elem = max(d.items(), key=lambda kv: kv[1])[0]
+            s.replace(i, elem)
+
+    # 6) Sort by species (Li, Y, Cl, then anything else) and write the POSCAR.
+    order = {"Li": 0, "Y": 1, "Cl": 2}
+    s = s.get_sorted_structure(key=lambda site: order.get(site.species.elements[0].symbol, 99))
+    Poscar(s).write_file(out_poscar)
+
+    # Report.
+    comp = {k: int(v) for k, v in s.composition.as_dict().items()}
+    print(f"写出 {out_poscar}；总原子数 = {len(s)}")
+    print(f"Y2识别={len(y2_idx)}，Y3识别={len(y3_idx)}，Li2识别={len(li2_idx)}；组成={comp}")
+
+import random
+from typing import List
+from pymatgen.core import Structure
+from pymatgen.io.vasp import Poscar
+
+def make_pnma_poscar_from_cif(cif_path: str,
+                               out_poscar: str = "POSCAR_pnma_supercell",
+                               seed: int = 42,
+                               supercell=(3,3,6),
+                               tol: float = 1e-6,
+                               li_occupancy: float = 0.75):
+    """
+    Expand a CIF with partially occupied Li sites into an ordered supercell POSCAR.
+
+    After the supercell is built, every disordered site whose majority species
+    is Li with occupancy below ``1 - tol`` is collected. A random
+    ``li_occupancy`` share of those sites is kept at full occupancy and the
+    others are removed as vacancies. Any site that is still disordered is
+    replaced by its majority species. The result is sorted Li, Y, Cl and
+    written with ``Poscar``.
+    (The original author's note states that the default (3, 3, 6) scaling gives
+    54 x 40 = 2160 sites before vacancies are created; not verified here.)
+
+    :param cif_path: Path of the input CIF file (e.g. a Pnma ``origin.cif``).
+    :param out_poscar: Path of the POSCAR file to write.
+    :param seed: Seed of the private random generator that picks kept sites.
+    :param supercell: Scaling passed to ``Structure.make_supercell``.
+    :param tol: Margin below 1 under which an occupancy counts as partial.
+    :param li_occupancy: Fraction of the partial Li sites to keep, in [0, 1].
+        The default 0.75 reproduces the previously hard-coded value.
+    :raises ValueError: If ``li_occupancy`` lies outside [0, 1].
+    """
+    if not 0.0 <= li_occupancy <= 1.0:
+        raise ValueError(f"li_occupancy must be within [0, 1], got {li_occupancy!r}")
+
+    # A private generator yields the same picks as random.seed(seed) would,
+    # without resetting the interpreter-wide random state.
+    rng = random.Random(seed)
+
+    s = Structure.from_file(cif_path)
+
+    # Build the supercell with the requested scaling.
+    s.make_supercell(supercell)
+
+    # Collect every partially occupied Li site.
+    partial_li_idx: List[int] = []
+    for i, site in enumerate(s.sites):
+        if not site.is_ordered:
+            d = site.species.as_dict()  # e.g. {'Li': 0.75}
+            # Only sites dominated by Li with occupancy below 1 qualify.
+            m_elem, m_occ = max(d.items(), key=lambda kv: kv[1])
+            if m_elem == "Li" and m_occ < 1 - tol:
+                partial_li_idx.append(i)
+
+    # Random rounding: keep the requested share, drop the rest as vacancies.
+    # Sampling is skipped when nothing or everything is kept.
+    n = len(partial_li_idx)
+    k = int(round(n * li_occupancy))
+    keep = set(rng.sample(partial_li_idx, k)) if 0 < k < n else set(partial_li_idx if k == n else [])
+    drop = [i for i in partial_li_idx if i not in keep]
+
+    # Replacing does not shift indices, so it is done before the removal.
+    for i in keep:
+        s.replace(i, "Li")
+    s.remove_sites(drop)
+
+    # Safety net: force the majority species on any site still disordered.
+    for i, site in enumerate(s.sites):
+        if not site.is_ordered:
+            d = site.species.as_dict()
+            elem = max(d.items(), key=lambda kv: kv[1])[0]
+            s.replace(i, elem)
+
+    # Sort by species (Li, Y, Cl, then anything else) and write the POSCAR.
+    order = {"Li": 0, "Y": 1, "Cl": 2}
+    s = s.get_sorted_structure(key=lambda site: order.get(site.species.elements[0].symbol, 99))
+    Poscar(s).write_file(out_poscar)
+
+    comp = {k: int(v) for k, v in s.composition.as_dict().items()}
+    print(f"写出 {out_poscar}；总原子数 = {len(s)}；组成 = {comp}")
+
+if __name__ == "__main__":
+    # Alternative run for the model3 structure, currently disabled:
+    # make_model3_poscar_from_cif("data/P3ma/model3.cif","data/P3ma/supercell_model4.poscar")
+    make_pnma_poscar_from_cif(
+        cif_path="data/Pnma/origin.cif",
+        out_poscar="data/Pnma/supercell_pnma.poscar",
+        seed=42,
+    )
--- a/dpgen/supercell_make_p3ma.py
+++ b/dpgen/supercell_make_p3ma.py