一些小修改

2025-11-19 12:23:17 +08:00
parent 95d719cc1e
commit 80ae03c8c1
25 changed files with 2291 additions and 17 deletions
--- a/learning/copy.py
+++ b/learning/copy.py
@@ -133,14 +133,134 @@ def copy_cif_with_O_or_S_robust(source_dir: str, target_dir: str, dry_run: bool
        print(f"模拟运行结束：如果实际运行，将会有 {copied_count} 个文件被复制。")
    else:
        print(f"成功复制了 {copied_count} 个文件到目标文件夹。")
+
+
+def _is_br_or_cl_symbol(token: str) -> bool:
+    """Return True if *token* is a Br/Cl element symbol, optionally suffixed.
+
+    Accepts forms such as ``Br``, ``CL``, ``Br-``, ``Cl1``, ``ClA`` or ``'Br1-'``.
+    An element symbol has at most two letters, so a lowercase letter right
+    after the first two characters means the token is a longer word
+    (``Bruker``, ``Clark``) rather than a halogen.
+    """
+    symbol = token.strip().strip("'\"")
+    if symbol[:2].upper() not in ('BR', 'CL'):
+        return False
+    # ''.islower() is False, so a bare two-letter symbol is accepted here.
+    return not symbol[2:3].islower()
+
+
+def _atom_rows_mention_br_or_cl(lines, element_col_idx: int) -> bool:
+    """Return True if any data row holds Br or Cl in column *element_col_idx*."""
+    for line in lines:
+        stripped = line.strip()
+        # Skip blank lines, comments, tag definitions and loop headers.
+        if not stripped or stripped.startswith(('#', '_', 'loop_')):
+            continue
+        fields = stripped.split()
+        if len(fields) > element_col_idx and _is_br_or_cl_symbol(fields[element_col_idx]):
+            return True
+    return False
+
+
+def _formula_mentions_br_or_cl(lines) -> bool:
+    """Return True if a chemical-formula tag line names Br or Cl.
+
+    The symbol must not be preceded by a letter nor followed by a lowercase
+    letter, so it is found after a space, a quote or a parenthesis alike
+    (``'Br2 Cu'``, ``(Cl)``, ``C2 H4 Br2``).
+    """
+    import re  # local import: the module's import block is outside this block
+
+    halogen = re.compile(r"(?<![A-Za-z])(?:Br|Cl)(?![a-z])")
+    formula_tags = ('_chemical_formula_sum', '_chemical_formula_structural')
+    for line in lines:
+        stripped = line.strip()
+        if stripped.startswith(formula_tags) and halogen.search(stripped):
+            return True
+    return False
+
+
+def copy_cif_without_Br_or_Cl(source_dir: str, target_dir: str, dry_run: bool = False):
+    """
+    Copy every CIF file whose content mentions neither Br nor Cl.
+
+    Each ``*.cif`` file directly inside ``source_dir`` is read as text. A file
+    is excluded when Br or Cl appears in the element column reported by
+    ``find_element_column_index`` or, failing that, on a
+    ``_chemical_formula_sum`` / ``_chemical_formula_structural`` line. All
+    other files are copied to ``target_dir`` with ``shutil.copy2``. Progress
+    and a final summary are printed; nothing is returned.
+
+    :param source_dir: Folder containing the CIF files to scan.
+    :param target_dir: Folder that receives the accepted files; created when
+        missing (unless ``dry_run`` is set).
+    :param dry_run: When True, only report what would be copied.
+    """
+    # 1. Resolve and validate the paths.
+    source_path = Path(source_dir)
+    target_path = Path(target_dir)
+    if not source_path.is_dir():
+        print(f"错误：源文件夹 '{source_dir}' 不存在或不是一个文件夹。")
+        return
+    if not dry_run and not target_path.exists():
+        target_path.mkdir(parents=True, exist_ok=True)
+        print(f"目标文件夹 '{target_dir}' 已创建。")
+
+    print(f"源文件夹: {source_path}")
+    print(f"目标文件夹: {target_path}")
+    if dry_run:
+        print("\n--- *** 模拟运行模式 (Dry Run) *** ---")
+        print("--- 不会执行任何实际的文件复制操作 ---")
+
+    # 2. Scan and filter.
+    print("\n开始扫描源文件夹，剔除含 Br 或 Cl 的CIF文件...")
+    copied_count = 0
+    checked_files = 0
+    error_files = 0
+    excluded_files = 0
+
+    for file_path in source_path.glob('*.cif'):
+        if not file_path.is_file():
+            continue
+
+        checked_files += 1
+
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                lines = f.readlines()
+
+            # Step A: structural check on the element column.
+            # NOTE(review): find_element_column_index is defined elsewhere in
+            # this module; -1 is treated here as "no element column found".
+            element_col_idx = find_element_column_index(lines)
+            contains_br_or_cl = (
+                element_col_idx != -1
+                and _atom_rows_mention_br_or_cl(lines, element_col_idx)
+            )
+
+            # Step B: fall back to the chemical formula lines.
+            if not contains_br_or_cl:
+                contains_br_or_cl = _formula_mentions_br_or_cl(lines)
+
+            # Step C: exclude or copy.
+            if contains_br_or_cl:
+                print(f"排除文件: '{file_path.name}' (检测到 Br 或 Cl 元素)")
+                excluded_files += 1
+            else:
+                target_file_path = target_path / file_path.name
+                print(f"找到匹配: '{file_path.name}' (不含 Br 或 Cl)")
+                if not dry_run:
+                    shutil.copy2(file_path, target_file_path)
+                copied_count += 1
+
+        except Exception as e:
+            # Deliberate best-effort: one unreadable or malformed file must not
+            # abort the whole scan; it is counted and reported instead.
+            error_files += 1
+            print(f"!! 处理文件 '{file_path.name}' 时发生错误: {e}")
+
+    # 3. Final report.
+    print("\n--- 操作总结 ---")
+    print(f"共检查了 {checked_files} 个.cif文件。")
+    print(f"排除了 {excluded_files} 个含有 Br 或 Cl 的文件。")
+    if error_files > 0:
+        print(f"处理过程中有 {error_files} 个文件发生错误。")
+    if dry_run:
+        print(f"模拟运行结束：如果实际运行，将会有 {copied_count} 个文件被复制。")
+    else:
+        print(f"成功复制了 {copied_count} 个文件到目标文件夹。")
+
 if __name__ == '__main__':
    # !! 重要：请将下面的路径修改为您自己电脑上的实际路径
-    source_folder = "D:/download/2025-10/data_all/input/input"
-    target_folder = "D:/download/2025-10/data_all/output"
+    # source_folder = "D:/download/2025-10/data_all/input/input"
+    # target_folder = "D:/download/2025-10/data_all/output"
+    #
+    # # --- 第一次运行：使用模拟模式 (Dry Run) ---
+    # print("================ 第一次运行: 模拟模式 ================")
+    # copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=True)
+    #
+    # print("\n\n=======================================================")
+    # input("检查上面的模拟运行结果。如果符合预期，按回车键继续执行实际复制操作...")
+    # print("=======================================================")
+    #
+    # # --- 第二次运行：实际执行复制 ---
+    # print("\n================ 第二次运行: 实际复制模式 ================")
+    # copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=False)
+
+    source_folder = "D:/download/2025-10/data_OS/input"
+    target_folder = "D:/download/2025-10/data_withoutBrCl/input"

    # --- 第一次运行：使用模拟模式 (Dry Run) ---
    print("================ 第一次运行: 模拟模式 ================")
-    copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=True)
+    copy_cif_without_Br_or_Cl(source_folder, target_folder, dry_run=True)

    print("\n\n=======================================================")
    input("检查上面的模拟运行结果。如果符合预期，按回车键继续执行实际复制操作...")
@@ -148,4 +268,4 @@ if __name__ == '__main__':

    # --- 第二次运行：实际执行复制 ---
    print("\n================ 第二次运行: 实际复制模式 ================")
-    copy_cif_with_O_or_S_robust(source_folder, target_folder, dry_run=False)
+    copy_cif_without_Br_or_Cl(source_folder, target_folder, dry_run=False)
--- a/learning/split.py
+++ b/learning/split.py
@@ -0,0 +1,111 @@
+import os
+import shutil
+import random
+from pathlib import Path
+
+
+def _copy_files(files_to_copy, destination_dir):
+    """Copy each file into *destination_dir* and return how many succeeded."""
+    copied_count = 0
+    for file_path in files_to_copy:
+        try:
+            # Copy rather than move so the source folder is left untouched.
+            shutil.copy(file_path, destination_dir)
+            copied_count += 1
+        except OSError as e:
+            # shutil's own errors derive from OSError; report and keep going.
+            print(f"处理文件 '{file_path.name}' 时出错: {e}")
+    return copied_count
+
+
+def split_dataset(source_dir: str, output_dir: str, test_ratio: float = 0.2,
+                  seed: "int | None" = None):
+    """
+    Copy the files of a folder into ``train`` and ``test`` subfolders.
+
+    The regular files directly inside ``source_dir`` are shuffled, the first
+    ``int(total * test_ratio)`` go to ``output_dir/test`` and the rest to
+    ``output_dir/train``. Files are copied, not moved. Progress is printed;
+    nothing is returned.
+
+    Args:
+        source_dir (str): Folder holding all data files.
+        output_dir (str): Folder in which 'train' and 'test' are created.
+        test_ratio (float, optional): Fraction of files for the test set,
+            between 0 and 1 inclusive. Defaults to 0.2.
+        seed (int, optional): Seed for a reproducible shuffle. With the
+            default None the module-level random generator is used.
+    """
+    print("--- 开始执行数据集划分 ---")
+
+    # 1. Validate the inputs before touching the file system.
+    source_path = Path(source_dir)
+    output_path = Path(output_dir)
+
+    if not source_path.is_dir():
+        print(f"错误：源文件夹 '{source_dir}' 不存在或不是一个目录。")
+        return
+
+    # A ratio outside [0, 1] (or NaN) would yield a negative count and a
+    # silently wrong split through negative slicing.
+    if not 0.0 <= test_ratio <= 1.0:
+        print(f"错误：test_ratio 必须在 0 到 1 之间，当前值为 {test_ratio}。")
+        return
+
+    # 2. Create the output folders (train and test).
+    train_dir = output_path / 'train'
+    test_dir = output_path / 'test'
+
+    try:
+        train_dir.mkdir(parents=True, exist_ok=True)
+        test_dir.mkdir(parents=True, exist_ok=True)
+        print(f"输出目录已准备好: \n  训练集 -> {train_dir}\n  测试集 -> {test_dir}")
+    except OSError as e:
+        print(f"错误：创建输出目录时发生错误: {e}")
+        return
+
+    # Leftovers from an earlier run are not removed, so a new shuffle can put
+    # the same file into both sets; warn instead of failing.
+    for existing_dir in (train_dir, test_dir):
+        if any(existing_dir.iterdir()):
+            print(f"警告：输出目录 '{existing_dir}' 非空；重复运行可能导致同一文件同时出现在训练集和测试集中。")
+
+    # 3. Collect the files; sort first so a given seed always yields the same
+    #    split regardless of the directory listing order.
+    all_files = sorted(f for f in source_path.iterdir() if f.is_file())
+
+    if not all_files:
+        print(f"警告：源文件夹 '{source_dir}' 中没有文件可供划分。")
+        return
+
+    if seed is None:
+        random.shuffle(all_files)
+    else:
+        random.Random(seed).shuffle(all_files)
+    total_files = len(all_files)
+    print(f"在源文件夹中找到 {total_files} 个文件。")
+
+    # 4. Work out the sizes of both sets.
+    num_test = int(total_files * test_ratio)
+    num_train = total_files - num_test
+
+    print(f"划分计划 -> 训练集: {num_train} 个文件 | 测试集: {num_test} 个文件")
+
+    # 5. Slice the shuffled list.
+    test_files = all_files[:num_test]
+    train_files = all_files[num_test:]
+
+    # 6. Copy each set into its folder.
+    print("\n正在复制文件到 'train' 文件夹...")
+    copied_train = _copy_files(train_files, train_dir)
+    print(f"成功复制 {copied_train} 个文件到训练集。")
+
+    print("\n正在复制文件到 'test' 文件夹...")
+    copied_test = _copy_files(test_files, test_dir)
+    print(f"成功复制 {copied_test} 个文件到测试集。")
+
+    print("\n--- 数据集划分完成！ ---")
+
+# --- 如何使用这个函数 ---
+if __name__ == '__main__':
+    # --- Configure the folders here ---
+
+    # Folder that holds the original data set.
+    SOURCE_DATA_DIR = 'D:/download/2025-10/data_OS/input/S'
+
+    # Folder in which the 'train' and 'test' subfolders are created.
+    OUTPUT_DIR = 'D:/download/2025-10/data_OS/train/S'
+
+    # --- End of configuration ---
+
+    # When the source folder is missing, create it and fill it with 100 small
+    # placeholder text files so the split below has something to work on.
+    # NOTE(review): with a mistyped real data path this generates fake data
+    # at that location - confirm this demo fallback is still wanted.
+    demo_root = Path(SOURCE_DATA_DIR)
+    if not demo_root.exists():
+        print(f"演示目录 '{SOURCE_DATA_DIR}' 不存在，正在创建并生成100个示例文件...")
+        demo_root.mkdir(parents=True)
+        for number in range(1, 101):
+            sample_file = demo_root / f'file_{number:03d}.txt'
+            sample_file.write_text(f'This is file {number}.')
+        print("示例文件创建完毕。")
+
+    # Run the split.
+    split_dataset(SOURCE_DATA_DIR, OUTPUT_DIR, test_ratio=0.2)