对比学习法增改

2025-10-29 11:39:30 +08:00
parent 1f8667ae51
commit 95d719cc1e
5 changed files with 466 additions and 0 deletions
--- a/learning/copy.py
+++ b/learning/copy.py
@@ -0,0 +1,151 @@
 import shutil
 from pathlib import Path
 def find_element_column_index(cif_lines: list) -> int:
    """
    Locate the column that holds ``_atom_site_type_symbol`` inside a CIF loop.

    Every ``loop_`` block is inspected in file order. The header of a loop is
    the run of lines starting with ``_`` that follows the ``loop_`` keyword;
    the position of a tag inside that run is its column index in the data rows.
    Loops that do not declare the tag (for example a symmetry-operator loop
    placed before the atom-site loop) are skipped instead of ending the search.

    :param cif_lines: Lines read from a CIF file.
    :return: Zero-based column index of the element symbol in the first loop
        that declares it, or -1 when no loop declares it.
    """
    target_tag = '_atom_site_type_symbol'
    in_loop_header = False
    column_index = -1
    current_column = 0
    for line in cif_lines:
        line_stripped = line.strip()
        # Blank lines and comments carry no structure and must not end a header.
        if not line_stripped or line_stripped.startswith('#'):
            continue
        if line_stripped.lower().startswith('loop_'):
            # A new loop starts: forget whatever the previous loop declared.
            in_loop_header = True
            column_index = -1
            current_column = 0
            continue
        if not in_loop_header:
            continue
        if line_stripped.startswith('_'):
            # Compare the whole tag, case-insensitively, so that a longer tag
            # sharing the same prefix is not mistaken for the target.
            if line_stripped.split()[0].lower() == target_tag:
                column_index = current_column
            current_column += 1
            continue
        # First non-tag line: the header is over and data rows begin.
        if column_index != -1:
            return column_index
        # This loop did not declare the tag; keep looking at later loops.
        in_loop_header = False
    # The file may end right after a header that declared the tag.
    return column_index if in_loop_header else -1
 def _strip_charge(token: str) -> str:
    """Return the bare element symbol of a type-symbol token (``O2-`` gives ``O``)."""
    # Quotes are dropped first, then trailing oxidation-state digits and signs.
    return token.strip('\'"').rstrip('0123456789+-')
 def _column_has_element(lines: list, column: int, targets: set) -> bool:
    """Report whether any data row holds one of ``targets`` in ``column``."""
    for line in lines:
        text = line.strip()
        # Skip blank lines, comments, tag definitions and loop keywords.
        if not text or text.startswith(('#', '_', 'loop_')):
            continue
        parts = text.split()
        if len(parts) > column and _strip_charge(parts[column]) in targets:
            return True
    return False
 def _formula_has_element(lines: list, targets: set) -> bool:
    """Report whether a chemical-formula line lists one of ``targets``."""
    import re  # local import: this file does not import ``re`` at module level
    formula_tags = ('_chemical_formula_sum', '_chemical_formula_structural')
    for line in lines:
        text = line.strip()
        if not text.startswith(formula_tags):
            continue
        fields = text.split(None, 1)
        if len(fields) < 2:
            # The value sits on a following line; this simple check ignores it.
            continue
        # An element symbol is one capital letter plus an optional lowercase
        # letter, so "Os" or "Si" are never mistaken for "O" or "S".
        if targets.intersection(re.findall(r'[A-Z][a-z]?', fields[1])):
            return True
    return False
 def copy_cif_with_O_or_S_robust(source_dir: str, target_dir: str, dry_run: bool = False):
    """
    Copy every CIF file that contains the element O or S into another folder.

    For each ``*.cif`` file directly inside ``source_dir`` the element-symbol
    column reported by ``find_element_column_index`` is scanned. When no such
    column is found, the ``_chemical_formula_sum`` /
    ``_chemical_formula_structural`` lines are used as a fallback. Progress and
    a final summary are printed; nothing is returned.

    :param source_dir: Folder that contains the CIF files.
    :param target_dir: Folder that receives the matching files (created when
        missing, unless ``dry_run`` is set).
    :param dry_run: When True, only report what would be copied.
    """
    # 1. Path handling and validation
    source_path = Path(source_dir)
    target_path = Path(target_dir)
    if not source_path.is_dir():
        print(f"错误：源文件夹 '{source_dir}' 不存在或不是一个文件夹。")
        return
    if not dry_run and not target_path.exists():
        target_path.mkdir(parents=True, exist_ok=True)
        print(f"目标文件夹 '{target_dir}' 已创建。")
    print(f"源文件夹: {source_path}")
    print(f"目标文件夹: {target_path}")
    if dry_run:
        print("\n--- *** 模拟运行模式 (Dry Run) *** ---")
        print("--- 不会执行任何实际的文件复制操作 ---")
    # 2. Scan and filter
    print("\n开始扫描源文件夹中的CIF文件...")
    targets = {'O', 'S'}
    copied_count = 0
    checked_files = 0
    error_files = 0
    # glob() only looks at the top level; switch to rglob('*.cif') to recurse.
    for file_path in source_path.glob('*.cif'):
        if not file_path.is_file():
            continue
        checked_files += 1
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
            element_col_idx = find_element_column_index(lines)
            # The verdict is recomputed for every file so that no state leaks
            # from one file to the next.
            if element_col_idx != -1:
                found = _column_has_element(lines, element_col_idx, targets)
            else:
                # No usable loop: fall back to the key-value formula lines.
                found = _formula_has_element(lines, targets)
            if not found:
                continue
            target_file_path = target_path / file_path.name
            print(f"找到匹配: '{file_path.name}' (含有 O 或 S 元素)")
            if not dry_run:
                shutil.copy2(file_path, target_file_path)
            copied_count += 1
        except OSError as e:
            # Reading or copying one file failed; report it and carry on.
            error_files += 1
            print(f"!! 处理文件 '{file_path.name}' 时发生错误: {e}")
    # 3. Final report
    print("\n--- 操作总结 ---")
    print(f"共检查了 {checked_files} 个.cif文件。")
    if error_files > 0:
        print(f"处理过程中有 {error_files} 个文件发生错误。")
    if dry_run:
        print(f"模拟运行结束：如果实际运行，将会有 {copied_count} 个文件被复制。")
    else:
        print(f"成功复制了 {copied_count} 个文件到目标文件夹。")
 if __name__ == '__main__':
    # NOTE: adjust both paths to real locations on your machine before running.
    cif_input_dir = "D:/download/2025-10/data_all/input/input"
    cif_output_dir = "D:/download/2025-10/data_all/output"
    # Pass 1: dry run, nothing is copied.
    print("================ 第一次运行: 模拟模式 ================")
    copy_cif_with_O_or_S_robust(source_dir=cif_input_dir, target_dir=cif_output_dir, dry_run=True)
    # Pause so the operator can review the dry-run report.
    print("\n\n=======================================================")
    input("检查上面的模拟运行结果。如果符合预期，按回车键继续执行实际复制操作...")
    print("=======================================================")
    # Pass 2: the real copy.
    print("\n================ 第二次运行: 实际复制模式 ================")
    copy_cif_with_O_or_S_robust(source_dir=cif_input_dir, target_dir=cif_output_dir, dry_run=False)
--- a/learning/delete.py
+++ b/learning/delete.py
@@ -0,0 +1,111 @@
 import shutil
 from pathlib import Path
 def delete_duplicates_from_second_folder(source_dir: str, target_dir: str, dry_run: bool = False):
    """
    Delete items in the second folder whose names also exist in the first.

    Only the direct children of both folders are compared, by name. Matching
    files and symbolic links are unlinked, matching directories are removed
    with their whole content. Progress and a summary are printed; nothing is
    returned.

    :param source_dir: First (reference) folder; it is never modified.
    :param target_dir: Second folder; same-named items are deleted from it.
    :param dry_run: When True, only report what would be deleted.
    """
    # 1. Work with Path objects
    source_path = Path(source_dir)
    target_path = Path(target_dir)
    # 2. Both paths must be existing directories
    if not source_path.is_dir():
        print(f"错误：源文件夹 '{source_dir}' 不存在或不是一个文件夹。")
        return
    if not target_path.is_dir():
        print(f"错误：目标文件夹 '{target_dir}' 不存在或不是一个文件夹。")
        return
    # Every name trivially matches itself when both arguments point at the same
    # directory, which would wipe the reference folder. Refuse to continue.
    if source_path.samefile(target_path):
        print("错误：源文件夹和目标文件夹是同一个文件夹，已中止操作以免清空源文件夹。")
        return
    print(f"源文件夹: {source_path}")
    print(f"目标文件夹: {target_path}")
    if dry_run:
        print("\n--- *** 模拟运行模式 (Dry Run) *** ---")
        print("--- 不会执行任何实际的删除操作 ---")
    # 3. Names of all direct children (files and sub-folders) of the source
    source_item_names = {p.name for p in source_path.iterdir()}
    if not source_item_names:
        print("\n源文件夹为空，无需执行任何操作。")
        return
    print(f"\n在源文件夹中找到 {len(source_item_names)} 个项目。")
    print("开始检查并删除目标文件夹中的同名项目...")
    deleted_count = 0
    # 4. Sorted iteration keeps the log order reproducible between runs.
    for item_name in sorted(source_item_names):
        item_to_delete = target_path / item_name
        # exists() follows links, so dangling symlinks are checked separately.
        if not (item_to_delete.exists() or item_to_delete.is_symlink()):
            continue
        try:
            if item_to_delete.is_dir() and not item_to_delete.is_symlink():
                # A real directory: remove it together with its content.
                print(f"准备删除文件夹及其所有内容: {item_to_delete}")
                if not dry_run:
                    shutil.rmtree(item_to_delete)
                    print("  -> 已删除。")
            else:
                # Files and symlinks are unlinked; rmtree rejects a symlink,
                # and unlinking leaves the link's destination untouched.
                print(f"准备删除文件: {item_to_delete}")
                if not dry_run:
                    item_to_delete.unlink()
                    print("  -> 已删除。")
            deleted_count += 1
        except OSError as e:
            print(f"!! 删除 '{item_to_delete}' 时发生错误: {e}")
    if deleted_count == 0:
        print("\n操作完成：在目标文件夹中没有找到需要删除的同名项目。")
    else:
        if dry_run:
            print(f"\n模拟运行结束：如果实际运行，将会有 {deleted_count} 个项目被删除。")
        else:
            print(f"\n操作完成：总共删除了 {deleted_count} 个项目。")
 # --- 使用示例 ---
 # 在运行前，请创建以下文件夹和文件结构进行测试：
 # /your/path/folder1/
 #   ├── file_a.txt
 #   ├── file_b.log
 #   └── subfolder_x/
 #       └── test.txt
 # /your/path/folder2/
 #   ├── file_a.txt      (将被删除)
 #   ├── file_c.md
 #   └── subfolder_x/    (将被删除)
 #       └── another.txt
 if __name__ == '__main__':
    # NOTE: adjust both paths to real locations on your machine before running.
    reference_dir = "D:/download/2025-10/after_step5/after_step5/S"  # names are read from here
    cleanup_dir = "D:/download/2025-10/input/input"  # same-named items are deleted here
    # Pass 1: dry run. It reports what would be deleted without touching anything.
    print("================ 第一次运行: 模拟模式 ================")
    delete_duplicates_from_second_folder(source_dir=reference_dir, target_dir=cleanup_dir, dry_run=True)
    # Pause so the operator can review the dry-run report.
    print("\n\n=======================================================")
    input("检查上面的模拟运行结果。如果符合预期，按回车键继续执行实际删除操作...")
    print("=======================================================")
    # Pass 2: the real deletion, run only after the operator confirmed above.
    print("\n================ 第二次运行: 实际删除模式 ================")
    delete_duplicates_from_second_folder(source_dir=reference_dir, target_dir=cleanup_dir, dry_run=False)
--- a/dpgen/data/Pnma/origin_backup.cif
+++ b/dpgen/data/Pnma/origin_backup.cif
@@ -0,0 +1,53 @@
 #------------------------------------------------------------------------------
 # CIF (Crystallographic Information File) for Li3YCl6
 # Data source: Table S1 from the provided image.
 # Rietveld refinement result of the neutron diffraction pattern for the 450 °C-annealed sample.
 #------------------------------------------------------------------------------
 data_Li3YCl6
 _chemical_name_systematic      'Lithium Yttrium Chloride'
 _chemical_formula_sum          'Li3 Y1 Cl6'
 _chemical_formula_structural   'Li3YCl6'
 _symmetry_space_group_name_H-M   'P n m a'
 _symmetry_Int_Tables_number      62
 _symmetry_cell_setting           orthorhombic
 loop_
 _symmetry_equiv_pos_as_xyz
 'x, y, z'
 '-x+1/2, y+1/2, -z+1/2'
 '-x, -y, -z'
 'x+1/2, -y+1/2, z+1/2'
 '-x, y+1/2, -z'
 'x-1/2, -y-1/2, z-1/2'
 'x, -y, z'
 '-x-1/2, y-1/2, -z-1/2'
 _cell_length_a                   12.92765(13)
 _cell_length_b                   11.19444(10)
 _cell_length_c                   6.04000(12)
 _cell_angle_alpha                90.0
 _cell_angle_beta                 90.0
 _cell_angle_gamma                90.0
 _cell_volume                     874.15
 _cell_formula_units_Z            4
 loop_
 _atom_site_label
 _atom_site_type_symbol
 _atom_site_fract_x
 _atom_site_fract_y
 _atom_site_fract_z
 _atom_site_occupancy
 _atom_site_Wyckoff_symbol
 _atom_site_U_iso_or_equiv
 Li1   Li   0.11730(7)   0.09640(7)   0.04860(10)   0.750(13)   8d   4.579(2)
 Li2   Li   0.13270(9)   0.07900(10)  0.48600(2)    0.750(19)   8d   9.554(4)
 Cl1   Cl   0.21726(7)   0.58920(7)   0.26362(11)   1.0         8d   0.797(17)
 Cl2   Cl   0.45948(8)   0.08259(8)   0.23831(13)   1.0         8d   1.548(2)
 Cl3   Cl   0.04505(10)  0.25000      0.74110(2)    1.0         4c   1.848(3)
 Cl4   Cl   0.20205(9)   0.25000      0.24970(2)    1.0         4c   0.561(2)
 Y1    Y    0.37529(10)  0.25000      0.01870(3)    1.0         4c   1.121(17)
 #------------------------------------------------------------------------------
--- a/dpgen/plus.py
+++ b/dpgen/plus.py
@@ -0,0 +1,151 @@
 import random
 from typing import List
 from pymatgen.core import Structure
 from pymatgen.io.vasp import Poscar
 def _is_close_frac(z, target, tol=2e-2):
    t = target % 1.0
    return min(abs(z - t), abs(z - (t + 1)), abs(z - (t - 1))) < tol
 def make_model3_poscar_from_cif(cif_path: str,
                                 out_poscar: str = "POSCAR_model3_supercell",
                                 seed: int = 42,
                                 tol: float = 2e-2):
    """
    Expand a CIF into a supercell, order its partial sites and write a POSCAR.

    The structure is expanded with the scaling matrix
    ``[[3,0,0],[2,4,0],[0,0,6]]``. Three site families are recognised by
    element and by their z coordinate in the ORIGINAL cell: Y near z=0.488
    ("Y2", 75 % kept), Y near z=0.935 ("Y3", 25 % kept) and Li near z=0.5
    ("Li2", 50 % kept). Kept sites become fully occupied, the others are
    removed. The original author states the result is a 2160-atom cell; this
    is not checked here.

    :param cif_path: CIF file to read.
    :param out_poscar: Path of the POSCAR file to write.
    :param seed: Seed passed to ``random.seed`` before the random selection.
    :param tol: Tolerance on the parent-cell fractional z coordinate.
    """
    random.seed(seed)
    # 1) Read the CIF
    s = Structure.from_file(cif_path)
    # 2) Build the supercell (a_s = 3a0, b_s = 2a0 + 4b0, c_s = 6c0)
    T = [[3, 0, 0],
         [2, 4, 0],
         [0, 0, 6]]
    s.make_supercell(T)
    def parent_z(frac):
        # Supercell fractional coordinates times the scaling matrix give the
        # coordinates in the original cell. The reference z values used below
        # belong to the original cell, so the comparison must be made there
        # rather than on the six-times-compressed supercell z.
        return sum(frac[row] * T[row][2] for row in range(3)) % 1.0
    # 3) Collect the three site families that need explicit ordering
    y2_idx: List[int] = []
    y3_idx: List[int] = []
    li2_idx: List[int] = []
    for i, site in enumerate(s.sites):
        # Fallback kept from the original for pymatgen versions where the
        # species object does not expose ``elements``.
        try:
            el = site.species.elements[0].symbol
        except Exception:
            ss = site.species_string
            el = "Li" if ss.startswith("Li") else ("Y" if ss.startswith("Y") else ("Cl" if ss.startswith("Cl") else ss))
        z = parent_z(site.frac_coords)
        if el == "Y":
            if _is_close_frac(z, 0.488, tol):
                y2_idx.append(i)
            elif _is_close_frac(z, 0.935, tol):
                # -0.065 and 0.935 are the same position modulo one cell.
                y3_idx.append(i)
        elif el == "Li":
            if _is_close_frac(z, 0.5, tol):
                li2_idx.append(i)
    def choose_keep(idxs, frac_keep):
        # Round the expected number of occupied sites and clamp it to [0, n].
        n = len(idxs)
        k = min(max(int(round(n * frac_keep)), 0), n)
        # Sampling is skipped for k == 0 and k == n so the random stream is
        # consumed exactly as before.
        keep = set(random.sample(idxs, k)) if 0 < k < n else set(idxs if k == n else [])
        drop = [i for i in idxs if i not in keep]
        return keep, drop
    keep_y2, drop_y2 = choose_keep(y2_idx, 0.75)
    keep_y3, drop_y3 = choose_keep(y3_idx, 0.25)
    keep_li2, drop_li2 = choose_keep(li2_idx, 0.50)
    # 4) Kept sites become fully occupied; replace() leaves indices untouched
    for i in keep_y2 | keep_y3:
        s.replace(i, "Y")
    for i in keep_li2:
        s.replace(i, "Li")
    # All removals go in one call, so no index shifts between deletions.
    s.remove_sites(drop_y2 + drop_y3 + drop_li2)
    # 5) Safety net: any site that is still disordered takes its majority
    #    element at full occupancy, since the POSCAR writer needs ordered sites.
    for i, site in enumerate(s.sites):
        if not site.is_ordered:
            d = site.species.as_dict()  # e.g. {'Li': 0.5} or {'Li': 0.5, 'Y': 0.5}
            elem = max(d.items(), key=lambda kv: kv[1])[0]
            s.replace(i, elem)
    # 6) Sort by element (Li, Y, Cl, then anything else) and write the POSCAR
    order = {"Li": 0, "Y": 1, "Cl": 2}
    s = s.get_sorted_structure(key=lambda site: order.get(site.species.elements[0].symbol, 99))
    Poscar(s).write_file(out_poscar)
    # Report
    comp = {k: int(v) for k, v in s.composition.as_dict().items()}
    print(f"写出 {out_poscar}；总原子数 = {len(s)}")
    print(f"Y2识别={len(y2_idx)}，Y3识别={len(y3_idx)}，Li2识别={len(li2_idx)}；组成={comp}")
 import random
 from typing import List
 from pymatgen.core import Structure
 from pymatgen.io.vasp import Poscar
 def make_pnma_poscar_from_cif(cif_path: str,
                               out_poscar: str = "POSCAR_pnma_supercell",
                               seed: int = 42,
                               supercell=(3,3,6),
                               tol: float = 1e-6,
                               keep_fraction: float = 0.75):
    """
    Expand a CIF into a supercell, order its partial Li sites, write a POSCAR.

    After the expansion, every disordered site whose majority species is Li
    with an occupancy below ``1 - tol`` is a candidate. A random
    ``keep_fraction`` of the candidates becomes fully occupied Li and the rest
    are removed as vacancies. The original author states that the default
    (3,3,6) cell holds 40 x 54 = 2160 atoms; this is not checked here.

    :param cif_path: CIF file to read.
    :param out_poscar: Path of the POSCAR file to write.
    :param seed: Seed passed to ``random.seed`` before the random selection.
    :param supercell: Scaling passed to ``Structure.make_supercell``.
    :param tol: Margin below 1 under which an occupancy counts as partial.
    :param keep_fraction: Share of the partial Li sites to keep. The default
        0.75 is the value that was previously hard-coded.
    :raises ValueError: If ``keep_fraction`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= keep_fraction <= 1.0:
        raise ValueError(f"keep_fraction must be within [0, 1], got {keep_fraction!r}")
    random.seed(seed)
    s = Structure.from_file(cif_path)
    s.make_supercell(supercell)
    # Collect every partially occupied site dominated by Li
    partial_li_idx: List[int] = []
    for i, site in enumerate(s.sites):
        if not site.is_ordered:
            d = site.species.as_dict()  # e.g. {'Li': 0.75}
            m_elem, m_occ = max(d.items(), key=lambda kv: kv[1])
            if m_elem == "Li" and m_occ < 1 - tol:
                partial_li_idx.append(i)
    # Random rounding: keep the requested share, drop the rest as vacancies.
    # Sampling is skipped for k == 0 and k == n so the random stream is
    # consumed exactly as before.
    n = len(partial_li_idx)
    k = int(round(n * keep_fraction))
    keep = set(random.sample(partial_li_idx, k)) if 0 < k < n else set(partial_li_idx if k == n else [])
    drop = [i for i in partial_li_idx if i not in keep]
    # replace() leaves indices untouched, so promote first and delete after.
    for i in keep:
        s.replace(i, "Li")
    # All removals go in one call, so no index shifts between deletions.
    s.remove_sites(drop)
    # Safety net: any site that is still disordered takes its majority element
    for i, site in enumerate(s.sites):
        if not site.is_ordered:
            d = site.species.as_dict()
            elem = max(d.items(), key=lambda kv: kv[1])[0]
            s.replace(i, elem)
    # Sort by element (Li, Y, Cl, then anything else) and write the POSCAR
    order = {"Li": 0, "Y": 1, "Cl": 2}
    s = s.get_sorted_structure(key=lambda site: order.get(site.species.elements[0].symbol, 99))
    Poscar(s).write_file(out_poscar)
    comp = {k: int(v) for k, v in s.composition.as_dict().items()}
    print(f"写出 {out_poscar}；总原子数 = {len(s)}；组成 = {comp}")
 if __name__ == "__main__":
    # The model3 variant is currently disabled; uncomment the call below to run it.
    # make_model3_poscar_from_cif("data/P3ma/model3.cif","data/P3ma/supercell_model4.poscar")
    make_pnma_poscar_from_cif(
        cif_path="data/Pnma/origin.cif",
        out_poscar="data/Pnma/supercell_pnma.poscar",
        seed=42,
    )
--- a/dpgen/supercell_make_p3ma.py
+++ b/dpgen/supercell_make_p3ma.py