import re
import shutil
from pathlib import Path


def find_element_column_index(cif_lines: list) -> int:
    """
    Locate the column of ``_atom_site_type_symbol`` within a CIF ``loop_`` header.

    Every ``loop_`` block is inspected in file order. A loop that does not
    declare the tag (for example a symmetry-operator loop placed before the
    atom-site loop) is skipped and the search continues with the next loop.
    Tag and keyword matching is case-insensitive, and ``#`` comment lines are
    ignored so that a comment inside a header does not terminate it.

    :param cif_lines: Lines read from a CIF file.
    :return: Zero-based index of the element-symbol column in the first loop
             that declares it, or -1 if no loop declares it.
    """
    target_tag = '_atom_site_type_symbol'
    in_loop_header = False
    column_index = -1
    current_column = 0

    for line in cif_lines:
        line_stripped = line.strip()
        # Blank lines and comments carry no structural information.
        if not line_stripped or line_stripped.startswith('#'):
            continue

        lowered = line_stripped.lower()
        if lowered.startswith('loop_'):
            # A new loop starts: reset the per-loop column bookkeeping.
            in_loop_header = True
            column_index = -1
            current_column = 0
            continue

        if not in_loop_header:
            continue

        if line_stripped.startswith('_'):
            # Compare the whole tag (first token) rather than a prefix so that
            # a longer tag sharing the same prefix is not mistaken for it.
            if lowered.split()[0] == target_tag:
                column_index = current_column
            current_column += 1
            continue

        # First non-tag line: the header of the current loop is over.
        if column_index != -1:
            return column_index
        # This loop did not declare the tag; keep looking in later loops.
        in_loop_header = False

    # The file may end right after a header (a loop without data rows).
    return column_index if in_loop_header else -1


# Elements a CIF file must contain (at least one of them) to be selected.
_TARGET_ELEMENTS = frozenset(('O', 'S'))
# Key-value tags consulted when no element-symbol column could be located.
_FORMULA_TAGS = ('_chemical_formula_sum', '_chemical_formula_structural')
# An element symbol optionally followed by a numeric/charge suffix,
# e.g. "O", "O2-", "S2-", "Fe3+". Group 1 is the bare element symbol.
_SITE_SYMBOL_RE = re.compile(r"([A-Z][a-z]?)[0-9.+-]*")
# Any element symbol inside a formula string such as "C2 H6 O" or "Fe2(SO4)3".
_FORMULA_ELEMENT_RE = re.compile(r"[A-Z][a-z]?")


def _site_column_has_target(lines: list, column: int) -> bool:
    """Return True if any data row holds a target element in field ``column``."""
    for raw in lines:
        text = raw.strip()
        # Skip blank lines, comments, tag definitions and loop keywords.
        if not text or text.startswith(('#', '_', 'loop_')):
            continue
        fields = text.split()
        if len(fields) <= column:
            continue
        # fullmatch keeps "O2-" (oxide) but rejects "Os" (osmium) and
        # arbitrary words that merely begin with a capital O or S.
        match = _SITE_SYMBOL_RE.fullmatch(fields[column].strip("'\""))
        if match and match.group(1) in _TARGET_ELEMENTS:
            return True
    return False


def _formula_has_target(lines: list) -> bool:
    """Return True if a chemical-formula line lists a target element."""
    for raw in lines:
        text = raw.strip()
        if not text.startswith(_FORMULA_TAGS):
            continue
        # Separate the tag from its value; a tag whose value sits on a
        # following line has nothing to inspect here.
        pieces = text.split(None, 1)
        if len(pieces) < 2:
            continue
        symbols = _FORMULA_ELEMENT_RE.findall(pieces[1])
        if not _TARGET_ELEMENTS.isdisjoint(symbols):
            return True
    return False


def _cif_has_target_element(lines: list) -> bool:
    """Decide whether the CIF content contains O or S."""
    element_col_idx = find_element_column_index(lines)
    if element_col_idx != -1:
        return _site_column_has_target(lines, element_col_idx)
    # No element column was found: fall back to the formula tags so that
    # files in plain key-value form are still considered.
    return _formula_has_target(lines)


def copy_cif_with_O_or_S_robust(source_dir: str, target_dir: str, dry_run: bool = False):
    """
    Copy every CIF file in ``source_dir`` that contains O or S to ``target_dir``.

    A file is selected when the element-symbol column located by
    ``find_element_column_index`` holds ``O`` or ``S`` (an optional charge
    suffix such as ``O2-`` is accepted). When no such column is found, the
    ``_chemical_formula_sum`` / ``_chemical_formula_structural`` lines are
    inspected instead. Only the top level of ``source_dir`` is scanned.
    Progress and a final summary are printed to stdout.

    :param source_dir: Folder containing the CIF files to inspect.
    :param target_dir: Folder that receives the selected files; created
                       (with parents) if missing, unless ``dry_run`` is set.
    :param dry_run: If True, only report which files would be copied and do
                    not create the target folder or copy anything.
    :return: None.
    """
    # 1. Resolve and validate the paths.
    source_path = Path(source_dir)
    target_path = Path(target_dir)

    if not source_path.is_dir():
        print(f"错误：源文件夹 '{source_dir}' 不存在或不是一个文件夹。")
        return

    if not dry_run and not target_path.exists():
        target_path.mkdir(parents=True, exist_ok=True)
        print(f"目标文件夹 '{target_dir}' 已创建。")

    print(f"源文件夹: {source_path}")
    print(f"目标文件夹: {target_path}")
    if dry_run:
        print("\n--- *** 模拟运行模式 (Dry Run) *** ---")
        print("--- 不会执行任何实际的文件复制操作 ---")

    # 2. Scan and filter.
    print("\n开始扫描源文件夹中的CIF文件...")
    copied_count = 0
    checked_files = 0
    error_files = 0

    # glob() looks at this folder only; rglob('*.cif') would also descend
    # into sub-folders.
    for file_path in source_path.glob('*.cif'):
        if not file_path.is_file():
            continue
        checked_files += 1
        try:
            # errors='ignore' drops undecodable bytes instead of failing.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()

            if not _cif_has_target_element(lines):
                continue

            print(f"找到匹配: '{file_path.name}' (含有 O 或 S 元素)")
            if not dry_run:
                shutil.copy2(file_path, target_path / file_path.name)
            # In dry-run mode this counts the files that would be copied.
            copied_count += 1
        except OSError as e:
            # A file that cannot be read or copied is reported and skipped
            # so that the remaining files are still processed.
            error_files += 1
            print(f"!! 处理文件 '{file_path.name}' 时发生错误: {e}")

    # 3. Final report.
    print("\n--- 操作总结 ---")
    print(f"共检查了 {checked_files} 个.cif文件。")
    if error_files > 0:
        print(f"处理过程中有 {error_files} 个文件发生错误。")
    if dry_run:
        print(f"模拟运行结束：如果实际运行，将会有 {copied_count} 个文件被复制。")
    else:
        print(f"成功复制了 {copied_count} 个文件到目标文件夹。")
if __name__ == '__main__':
    # IMPORTANT: change these to real paths on your own machine
    # (source folder first, target folder second).
    cif_dirs = (
        "D:/download/2025-10/data_all/input/input",
        "D:/download/2025-10/data_all/output",
    )

    # Pass 1: dry run, only reports what would be copied.
    print("================ 第一次运行: 模拟模式 ================")
    copy_cif_with_O_or_S_robust(*cif_dirs, dry_run=True)

    # Give the user a chance to review the dry-run output before copying.
    print("\n\n=======================================================")
    input("检查上面的模拟运行结果。如果符合预期，按回车键继续执行实际复制操作...")
    print("=======================================================")

    # Pass 2: perform the actual copy.
    print("\n================ 第二次运行: 实际复制模式 ================")
    copy_cif_with_O_or_S_robust(*cif_dirs, dry_run=False)