# screen/src/analysis/structure_inspector.py

"""
结构检查器：对单个CIF文件进行深度分析（含扩胞需求判断）
"""
from dataclasses import dataclass, field
from typing import Set, Dict, List, Optional, Tuple
from pymatgen.core import Structure
from pymatgen.core.periodic_table import Element, Specie
from collections import defaultdict
from fractions import Fraction
from functools import reduce
import math
import re
import os


@dataclass
class OccupancyInfo:
    """One distinct fractional occupancy value and the sites that carry it.

    ``numerator`` / ``denominator`` hold a rational form of ``occupation``;
    the two ``involves_*`` flags describe which kinds of element share the
    affected sites.
    """
    # Occupancy value as read from the structure.
    occupation: float
    # Serial numbers of the sites that show this occupancy.
    atom_serials: List[int] = field(default_factory=list)
    # Element symbols found on those sites.
    elements: List[str] = field(default_factory=list)
    # Numerator of the rational form of ``occupation``.
    numerator: int = 0
    # Denominator of the rational form of ``occupation``.
    denominator: int = 1
    # True when the target cation is among ``elements``.
    involves_target_cation: bool = False
    # True when an anion is among ``elements``.
    involves_anion: bool = False


@dataclass
class ExpansionInfo:
    """Summary of the supercell expansion required by a structure.

    A default-constructed instance means "no expansion needed".
    """
    # Whether a supercell is required at all.
    needs_expansion: bool = False
    # Supercell multiplier (least common multiple of the occupancy denominators).
    expansion_factor: int = 1
    # Per-occupancy breakdown of the sites that triggered the expansion.
    occupancy_details: List[OccupancyInfo] = field(default_factory=list)
    # Number of sites with problematic occupancy.
    problematic_sites: int = 0
    # Whether the expansion is feasible.
    can_expand: bool = True
    # Why the expansion is not feasible (empty when it is).
    skip_reason: str = ""


@dataclass
class StructureInfo:
    """Analysis result for a single structure file.

    Only ``file_path`` and ``file_name`` are required; every other field
    starts at a neutral default and is filled in during inspection.
    """
    file_path: str
    file_name: str

    # --- basic status ---
    # True once the file has been read successfully.
    is_valid: bool = False
    # Description of a read or analysis failure (empty when none).
    error_message: str = ""

    # --- composition ---
    # Elements present in the structure.
    elements: Set[str] = field(default_factory=set)
    # Number of sites in the structure.
    num_sites: int = 0
    # Chemical formula of the structure.
    formula: str = ""

    # --- cation / anion information ---
    # Whether the target cation is present.
    contains_target_cation: bool = False
    # Target anions found in the structure.
    anion_types: Set[str] = field(default_factory=set)
    # One of "single", "mixed", "none".
    anion_mode: str = ""

    # --- data-quality flags ---
    has_oxidation_states: bool = False
    # Any site with partial or mixed occupancy.
    has_partial_occupancy: bool = False
    has_water_molecule: bool = False
    has_radioactive_elements: bool = False
    is_binary_compound: bool = False

    # --- detailed partial-occupancy analysis (added later) ---
    # Target cation (e.g. Li) shares a site with vacancies only: no handling needed.
    cation_with_vacancy: bool = False
    # Target cation shares a site with another cation: supercell needed.
    cation_with_other_cation: bool = False
    # Partial/mixed occupancy involving an anion.
    anion_has_partial_occupancy: bool = False
    # Partial/mixed occupancy among other elements: supercell needed.
    other_has_partial_occupancy: bool = False
    expansion_info: ExpansionInfo = field(default_factory=ExpansionInfo)

    # --- processability ---
    needs_expansion: bool = False
    can_process: bool = False
    # Reasons the structure is skipped, joined into one string.
    skip_reason: str = ""


class StructureInspector:
    """Analyse one CIF file and decide whether it can be processed further.

    The inspector reads the structure, collects its element set, classifies
    every partially occupied or mixed site, and derives the supercell
    multiplier needed to turn the recorded fractional occupancies into
    whole numbers of atoms.

    Site classification rules (see ``_analyze_partial_occupancy``):
        * target cation alone on a site with occupancy < 1 (cation + vacancy):
          nothing to do;
        * target cation sharing a site with at least one non-anion element:
          supercell needed;
        * target cation sharing a site with anions only, or a disordered site
          without the target cation that involves an anion: flagged as anion
          disorder, which makes the structure unprocessable;
        * other elements sharing / partially occupying a site: supercell needed.
    """

    # Anions used when the caller does not supply ``target_anions``.
    VALID_ANIONS = {'O', 'S', 'Cl', 'Br'}

    # Elements treated as radioactive; their presence makes a structure unprocessable.
    RADIOACTIVE_ELEMENTS = {
        'U', 'Th', 'Pu', 'Ra', 'Rn', 'Po', 'Np', 'Am',
        'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'
    }

    # Precision mode -> largest denominator allowed when an occupancy is
    # converted to a fraction (None = exact fraction).
    PRECISION_LIMITS = {
        'high': None,      # exact fraction
        'normal': 100,     # denominator <= 100
        'low': 10,         # denominator <= 10
        'very_low': 5      # denominator <= 5
    }

    # Largest supercell multiplier still considered feasible.
    MAX_EXPANSION_FACTOR = 64

    # Leading element symbol of a species label such as "Li+", "Sc3+" or "O".
    _ELEMENT_SYMBOL_RE = re.compile(r'([A-Z][a-z]?)')

    def __init__(
        self,
        target_cation: str = "Li",
        target_anions: Optional[Set[str]] = None,
        expansion_precision: str = "low"
    ):
        """
        Create an inspector.

        Args:
            target_cation: Symbol of the target cation (e.g. "Li", "Na").
            target_anions: Symbols of the target anions (e.g. {"O", "S"}).
                ``None`` or an empty collection selects ``VALID_ANIONS``.
            expansion_precision: Key of ``PRECISION_LIMITS`` ('high', 'normal',
                'low', 'very_low'). An unknown key is not rejected; it behaves
                like 'high' because no denominator limit is found for it.
        """
        self.target_cation = target_cation
        # Copy, so that mutating the instance attribute can never modify the
        # class-level VALID_ANIONS constant or the caller's own set.
        self.target_anions = set(target_anions or self.VALID_ANIONS)
        self.expansion_precision = expansion_precision

        # Possible textual spellings of the target cation.
        self.target_cation_variants = {
            target_cation,
            f"{target_cation}+",
            f"{target_cation}1+",
        }

    def inspect(self, file_path: str) -> "StructureInfo":
        """
        Analyse a single CIF file.

        Args:
            file_path: Path of the CIF file.

        Returns:
            StructureInfo: The analysis result. ``is_valid`` is False when the
            file could not be read. When reading succeeded but a later step
            failed, ``is_valid`` stays True and ``error_message`` describes
            the failure.
        """
        info = StructureInfo(
            file_path=file_path,
            file_name=os.path.basename(file_path)
        )

        # Reading is the only step whose failure marks the file as invalid.
        try:
            structure = Structure.from_file(file_path)
        except Exception as e:
            info.is_valid = False
            info.error_message = f"读取CIF失败: {e}"
            return info

        info.is_valid = True

        # Everything below is best effort: an error is recorded, and the
        # information gathered so far is still returned.
        try:
            # Reduce species labels to bare element symbols. For structures
            # carrying oxidation states the labels look like "Li+" / "O2-",
            # which would never match the plain symbols used for the target
            # cation, the anions and the radioactive-element set.
            symbols = (
                self._get_element_from_species_string(str(el))
                for el in structure.composition.elements
            )
            info.elements = {symbol for symbol in symbols if symbol}
            info.num_sites = structure.num_sites
            info.formula = structure.composition.reduced_formula

            # Binary compound = exactly two distinct elements (not species).
            info.is_binary_compound = len(info.elements) == 2

            info.contains_target_cation = self.target_cation in info.elements

            info.anion_types = info.elements.intersection(self.target_anions)
            if not info.anion_types:
                info.anion_mode = "none"
            elif len(info.anion_types) == 1:
                info.anion_mode = "single"
            else:
                info.anion_mode = "mixed"

            info.has_oxidation_states = self._check_oxidation_states(structure)

            # Core analysis: partial / mixed occupancy. A failure here is
            # recorded and the remaining checks still run.
            try:
                self._analyze_partial_occupancy(structure, info)
            except Exception as e:
                info.error_message = f"共占位分析失败: {e}"

            try:
                info.has_water_molecule = self._check_water_molecule(structure)
            except Exception:
                info.has_water_molecule = False

            info.has_radioactive_elements = bool(
                info.elements.intersection(self.RADIOACTIVE_ELEMENTS)
            )

            self._evaluate_processability(info)

        except Exception as e:
            # The file itself is readable, so is_valid stays True; keep any
            # earlier note and add this failure to it.
            failure = f"分析过程出错: {e}"
            info.error_message = "; ".join(
                msg for msg in (info.error_message, failure) if msg
            )

        return info

    def _check_oxidation_states(self, structure: "Structure") -> bool:
        """Return True if any species on any site carries an oxidation state."""
        try:
            for site in structure.sites:
                for specie in site.species.keys():
                    # Duck-type on ``oxi_state`` first: in pymatgen releases
                    # where ``Specie`` is a deprecated subclass of ``Species``,
                    # parsed species are not instances of ``Specie``.
                    if getattr(specie, "oxi_state", None) is not None:
                        return True
                    if isinstance(specie, Specie):
                        return True
            return False
        except Exception:
            return False

    def _get_element_from_species_string(self, species_str: str) -> str:
        """Return the leading element symbol of a species label ("" if none)."""
        match = self._ELEMENT_SYMBOL_RE.match(species_str)
        return match.group(1) if match else ""

    def _get_occupancy_from_species_string(self, species_str: str, exclude_elements: Set[str]) -> Optional[float]:
        """
        Return the occupancy of the first element not in ``exclude_elements``.

        Expected format: "Li+:0.689, Sc3+:0.311".

        Args:
            species_str: Comma-separated "label:occupancy" entries.
            exclude_elements: Element symbols to ignore.

        Returns:
            The first parsable occupancy of a non-excluded element, or None
            when there is none. Entries whose occupancy is not a number are
            skipped.
        """
        if ':' not in species_str:
            return None

        for part in species_str.split(','):
            # partition() never raises, unlike a two-name unpack of
            # split(':') on an entry containing more than one colon.
            label, sep, occu_text = part.strip().partition(':')
            if not sep:
                continue
            element = self._get_element_from_species_string(label.strip())
            if not element or element in exclude_elements:
                continue
            try:
                return float(occu_text.strip())
            except ValueError:
                continue
        return None

    def _analyze_partial_occupancy(self, structure: "Structure", info: "StructureInfo"):
        """
        Classify every partially occupied / mixed site and fill in ``info``.

        Rules:
        - target cation + vacancy only -> ``cation_with_vacancy`` (no handling)
        - target cation + at least one non-anion element ->
          ``cation_with_other_cation`` (supercell needed)
        - target cation + anions only, or a site without the target cation
          that involves an anion -> ``anion_has_partial_occupancy``
        - other sites -> ``other_has_partial_occupancy`` (supercell needed)

        The occupancies that require a supercell are collected and handed to
        ``_calculate_expansion_info``.
        """
        occupancy_dict = defaultdict(list)  # {occupancy: [site indices]}
        occupancy_elements = {}  # {occupancy: [element symbols]}

        def record(occu, site_index, site_elements):
            """Register one site under ``occu`` without duplicating entries."""
            indices = occupancy_dict[occu]
            if site_index not in indices:
                indices.append(site_index)
            # Merge rather than overwrite, so elements of earlier sites with
            # the same occupancy are not lost.
            merged = occupancy_elements.setdefault(occu, [])
            for symbol in site_elements:
                if symbol not in merged:
                    merged.append(symbol)

        for i, site in enumerate(structure.sites):
            # Element symbol -> occupancy for this site.
            # NOTE(review): two species of the same element on one site
            # (e.g. different oxidation states) overwrite each other here;
            # confirm whether their occupancies should be summed instead.
            species_occu = {}
            for sp, occu in site.species.items():
                label = sp.symbol if hasattr(sp, 'symbol') else str(sp)
                elem = self._get_element_from_species_string(label)
                if elem:
                    species_occu[elem] = occu

            elements_at_site = list(species_occu.keys())

            # Disordered = some occupancy below 1, or several elements on the site.
            has_partial = len(species_occu) > 1 or any(
                occu < 1.0 for occu in species_occu.values()
            )
            if not has_partial:
                continue

            info.has_partial_occupancy = True

            if self.target_cation in elements_at_site:
                cation_occu = species_occu.get(self.target_cation, 0)
                other_elements = [e for e in elements_at_site if e != self.target_cation]

                if not other_elements:
                    if cation_occu < 1.0:
                        # Target cation shares the site with vacancies only.
                        info.cation_with_vacancy = True
                elif all(e in self.target_anions for e in other_elements):
                    # Target cation mixed with anions only (rare): treated as
                    # anion disorder.
                    info.anion_has_partial_occupancy = True
                else:
                    # Target cation mixed with another cation: supercell needed.
                    info.cation_with_other_cation = True

                    # Record the occupancies of the non-anion partners.
                    for elem in other_elements:
                        if elem in self.target_anions:
                            continue
                        occu = species_occu.get(elem, 0)
                        if 0 < occu < 1.0:
                            record(occu, i, elements_at_site)
            elif any(elem in self.target_anions for elem in elements_at_site):
                # Site without the target cation that involves an anion.
                info.anion_has_partial_occupancy = True
            else:
                # Disorder among other elements: supercell needed.
                info.other_has_partial_occupancy = True

                # Only the first fractional occupancy of the site is recorded,
                # so each such site is counted once.
                # NOTE(review): for three or more species on one site a single
                # occupancy may not determine the full multiplier; confirm.
                for occu in species_occu.values():
                    if 0 < occu < 1.0:
                        record(occu, i, elements_at_site)
                        break

        self._calculate_expansion_info(info, occupancy_dict, occupancy_elements)

    def _evaluate_processability(self, info: "StructureInfo"):
        """
        Decide whether the structure can be processed and why not.

        Sets ``info.can_process`` and ``info.skip_reason`` (reasons joined
        with "; "), and sets ``info.needs_expansion`` when cation disorder is
        present and a feasible supercell exists. Target cation + vacancy
        disorder never counts against the structure.
        """
        skip_reasons = []

        if not info.is_valid:
            skip_reasons.append("无法解析CIF文件")

        if not info.contains_target_cation:
            skip_reasons.append(f"不含{self.target_cation}")

        if info.anion_mode == "none":
            skip_reasons.append("不含目标阴离子")

        if info.is_binary_compound:
            skip_reasons.append("二元化合物")

        if info.has_radioactive_elements:
            skip_reasons.append("含放射性元素")

        # Target cation mixed with another cation: fine as long as the
        # supercell is feasible.
        if info.cation_with_other_cation:
            if info.expansion_info.can_expand:
                info.needs_expansion = True
            else:
                skip_reasons.append(
                    f"{self.target_cation}与其他阳离子共占位且{info.expansion_info.skip_reason}"
                )

        # Anion disorder is not handled.
        if info.anion_has_partial_occupancy:
            skip_reasons.append("阴离子存在共占位")

        if info.has_water_molecule:
            skip_reasons.append("含水分子")

        # Disorder among other cations (target cation not involved).
        if info.other_has_partial_occupancy:
            if info.expansion_info.can_expand:
                info.needs_expansion = True
            else:
                skip_reasons.append(info.expansion_info.skip_reason)

        if skip_reasons:
            info.can_process = False
            info.skip_reason = "; ".join(skip_reasons)
        else:
            info.can_process = True

    def _calculate_expansion_info(
        self,
        info: "StructureInfo",
        occupancy_dict: Dict[float, List[int]],
        occupancy_elements: Dict[float, List[str]]
    ):
        """
        Derive the supercell multiplier from the recorded occupancies.

        Args:
            info: Result object; ``expansion_info`` is always replaced, and
                ``needs_expansion`` is updated when occupancies were recorded.
            occupancy_dict: Occupancy -> 0-based indices of the affected sites.
            occupancy_elements: Occupancy -> element symbols on those sites.
        """
        expansion_info = ExpansionInfo()

        if not occupancy_dict:
            info.expansion_info = expansion_info
            return

        expansion_info.needs_expansion = True
        expansion_info.problematic_sites = sum(len(v) for v in occupancy_dict.values())

        # None (or an unknown precision name) means "exact fraction".
        # NOTE(review): with a small limit an occupancy close to 0 or 1 rounds
        # to 0/1 or 1/1 and no longer contributes to the multiplier.
        limit = self.PRECISION_LIMITS.get(self.expansion_precision)

        occupancy_list = []
        for occu, serials in occupancy_dict.items():
            elements = occupancy_elements.get(occu, [])

            if limit:
                fraction = Fraction(occu).limit_denominator(limit)
            else:
                fraction = Fraction(occu).limit_denominator()

            occupancy_list.append(OccupancyInfo(
                occupation=occu,
                atom_serials=[s + 1 for s in serials],  # convert to 1-based
                elements=elements,
                numerator=fraction.numerator,
                denominator=fraction.denominator,
                involves_target_cation=self.target_cation in elements,
                involves_anion=any(e in self.target_anions for e in elements)
            ))

        expansion_info.occupancy_details = occupancy_list

        # Supercell multiplier = least common multiple of all denominators.
        lcm = reduce(
            lambda a, b: a * b // math.gcd(a, b),
            (occ.denominator for occ in occupancy_list),
            1
        )
        expansion_info.expansion_factor = lcm

        # A multiplier above the cap is treated as infeasible.
        if lcm > self.MAX_EXPANSION_FACTOR:
            expansion_info.can_expand = False
            expansion_info.skip_reason = f"扩胞因子过大({lcm})"

        info.expansion_info = expansion_info
        info.needs_expansion = expansion_info.needs_expansion and expansion_info.can_expand

    def _check_water_molecule(self, structure: "Structure") -> bool:
        """
        Return True if some oxygen site has at least two hydrogen sites
        closer than 1.2 (in the units of ``site.distance``).

        Sites are matched by element symbol, so elements whose symbol merely
        contains the letter (Os, Hf, Hg, Ho, He, ...) are not mistaken for
        O or H. Any error during the check yields False.
        """
        try:
            oxygen_sites = []
            hydrogen_sites = []

            for site in structure.sites:
                symbols = {
                    self._get_element_from_species_string(
                        sp.symbol if hasattr(sp, 'symbol') else str(sp)
                    )
                    for sp in site.species.keys()
                }
                if 'O' in symbols:
                    oxygen_sites.append(site)
                if 'H' in symbols:
                    hydrogen_sites.append(site)

            for o_site in oxygen_sites:
                nearby_h = [h for h in hydrogen_sites if o_site.distance(h) < 1.2]
                if len(nearby_h) >= 2:
                    return True
            return False
        except Exception:
            return False