""" 结构检查器:对单个CIF文件进行深度分析(含扩胞需求判断) """ from dataclasses import dataclass, field from typing import Set, Dict, List, Optional, Tuple from pymatgen.core import Structure from pymatgen.core.periodic_table import Element, Specie from collections import defaultdict from fractions import Fraction from functools import reduce import math import re import os @dataclass class OccupancyInfo: """共占位信息""" occupation: float # 占据率 atom_serials: List[int] = field(default_factory=list) # 原子序号 elements: List[str] = field(default_factory=list) # 涉及的元素 numerator: int = 0 # 分子 denominator: int = 1 # 分母 involves_target_cation: bool = False # 是否涉及目标阳离子 involves_anion: bool = False # 是否涉及阴离子 @dataclass class ExpansionInfo: """扩胞信息""" needs_expansion: bool = False # 是否需要扩胞 expansion_factor: int = 1 # 扩胞因子(最小公倍数) occupancy_details: List[OccupancyInfo] = field(default_factory=list) # 共占位详情 problematic_sites: int = 0 # 问题位点数 can_expand: bool = True # 是否可以扩胞处理 skip_reason: str = "" # 无法扩胞的原因 @dataclass class StructureInfo: """单个结构的分析结果""" file_path: str file_name: str # 基础信息 is_valid: bool = False error_message: str = "" # 元素组成 elements: Set[str] = field(default_factory=set) num_sites: int = 0 formula: str = "" # 阳离子/阴离子信息 contains_target_cation: bool = False anion_types: Set[str] = field(default_factory=set) anion_mode: str = "" # "single", "mixed", "none" # 数据质量标记 has_oxidation_states: bool = False has_partial_occupancy: bool = False # 是否有共占位 has_water_molecule: bool = False has_radioactive_elements: bool = False is_binary_compound: bool = False # 共占位详细分析(新增) cation_with_vacancy: bool = False # Li与空位共占位(不需处理) cation_with_other_cation: bool = False # Li与其他阳离子共占位(需扩胞) anion_has_partial_occupancy: bool = False # 阴离子共占位 other_has_partial_occupancy: bool = False # 其他元素共占位(需扩胞) expansion_info: ExpansionInfo = field(default_factory=ExpansionInfo) # 可处理性 needs_expansion: bool = False can_process: bool = False skip_reason: str = "" class StructureInspector: """结构检查器(含扩胞分析)""" # 预定义的阴离子集合 VALID_ANIONS = {'O', 'S', 'Cl', 'Br'} # 放射性元素 RADIOACTIVE_ELEMENTS = { 'U', 'Th', 'Pu', 'Ra', 'Rn', 'Po', 'Np', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr' } # 扩胞精度模式 PRECISION_LIMITS = { 'high': None, # 精确分数 'normal': 100, # 分母≤100 'low': 10, # 分母≤10 'very_low': 5 # 分母≤5 } def __init__( self, target_cation: str = "Li", target_anions: Set[str] = None, expansion_precision: str = "low" ): """ 初始化检查器 Args: target_cation: 目标阳离子 (如 "Li", "Na") target_anions: 目标阴离子集合 (如 {"O", "S"}) expansion_precision: 扩胞计算精度 ('high', 'normal', 'low', 'very_low') """ self.target_cation = target_cation self.target_anions = target_anions or self.VALID_ANIONS self.expansion_precision = expansion_precision # 目标阳离子的各种可能表示形式 self.target_cation_variants = { target_cation, f"{target_cation}+", f"{target_cation}1+", } def inspect(self, file_path: str) -> StructureInfo: """ 分析单个CIF文件 Args: file_path: CIF文件路径 Returns: StructureInfo: 分析结果 """ import os info = StructureInfo( file_path=file_path, file_name=os.path.basename(file_path) ) # 尝试读取结构 try: structure = Structure.from_file(file_path) except Exception as e: info.is_valid = False info.error_message = f"读取CIF失败: {str(e)}" return info # 结构读取成功,标记为有效 info.is_valid = True # 后续分析用 try-except 包裹,确保即使分析出错也能返回基本信息 try: # 基础信息 info.elements = {str(el) for el in structure.composition.elements} info.num_sites = structure.num_sites info.formula = structure.composition.reduced_formula # 检查是否为二元化合物 info.is_binary_compound = len(structure.composition.elements) == 2 # 检查是否含有目标阳离子 info.contains_target_cation = self.target_cation in info.elements # 检查阴离子类型 info.anion_types = info.elements.intersection(self.target_anions) if len(info.anion_types) == 0: info.anion_mode = "none" elif len(info.anion_types) == 1: info.anion_mode = "single" else: info.anion_mode = "mixed" # 检查氧化态 info.has_oxidation_states = self._check_oxidation_states(structure) # 检查共占位(核心分析) try: self._analyze_partial_occupancy(structure, info) except Exception as e: # 共占位分析失败,记录但继续 pass # 检查水分子 try: info.has_water_molecule = self._check_water_molecule(structure) except: info.has_water_molecule = False # 检查放射性元素 info.has_radioactive_elements = bool( info.elements.intersection(self.RADIOACTIVE_ELEMENTS) ) # 判断可处理性 self._evaluate_processability(info) except Exception as e: # 分析过程出错,但文件本身是有效的 # 保留 is_valid = True,但记录错误 info.error_message = f"分析过程出错: {str(e)}" return info def _check_oxidation_states(self, structure: Structure) -> bool: """检查结构是否包含氧化态信息""" try: for site in structure.sites: for specie in site.species.keys(): if isinstance(specie, Specie): return True return False except: return False def _get_element_from_species_string(self, species_str: str) -> str: """从物种字符串提取纯元素符号""" match = re.match(r'([A-Z][a-z]?)', species_str) return match.group(1) if match else "" def _get_occupancy_from_species_string(self, species_str: str, exclude_elements: Set[str]) -> Optional[float]: """ 从物种字符串获取非目标元素的占据率 格式如: "Li+:0.689, Sc3+:0.311" """ if ':' not in species_str: return None parts = [p.strip() for p in species_str.split(',')] for part in parts: if ':' in part: element_part, occu_part = part.split(':') element = self._get_element_from_species_string(element_part.strip()) if element and element not in exclude_elements: try: return float(occu_part.strip()) except ValueError: continue return None # 在 StructureInspector 类中,替换 _analyze_partial_occupancy 方法 def _analyze_partial_occupancy(self, structure: Structure, info: StructureInfo): """ 分析共占位情况(修正版) 关键规则: - Li与空位共占位 → 不需要处理(cation_with_vacancy) - Li与其他阳离子共占位 → 需要扩胞(cation_with_other_cation) - 阴离子共占位 → 通常不处理 - 其他阳离子共占位 → 需要扩胞 """ occupancy_dict = defaultdict(list) # {occupation: [site_indices]} occupancy_elements = {} # {occupation: [elements]} for i, site in enumerate(structure.sites): site_species = site.species species_string = str(site.species) # 提取各元素及其占据率 species_occu = {} # {element: occupancy} for sp, occu in site_species.items(): elem = sp.symbol if hasattr(sp, 'symbol') else str(sp) elem = self._get_element_from_species_string(elem) if elem: species_occu[elem] = occu total_occupancy = sum(species_occu.values()) elements_at_site = list(species_occu.keys()) # 检查是否有部分占据 has_partial = any(occu < 1.0 for occu in species_occu.values()) or len(species_occu) > 1 if not has_partial: continue info.has_partial_occupancy = True # 判断Li的共占位情况 if self.target_cation in elements_at_site: li_occu = species_occu.get(self.target_cation, 0) other_elements = [e for e in elements_at_site if e != self.target_cation] if not other_elements and li_occu < 1.0: # Li与空位共占位(Li占据率<1,但没有其他元素) info.cation_with_vacancy = True elif other_elements: # Li与其他元素共占位 other_are_anions = all(e in self.target_anions for e in other_elements) if other_are_anions: # Li与阴离子共占位(罕见,标记为阴离子共占位) info.anion_has_partial_occupancy = True else: # Li与其他阳离子共占位 → 需要扩胞 info.cation_with_other_cation = True # 记录需要扩胞的占据率(取非Li元素的占据率) for elem in other_elements: if elem not in self.target_anions: occu = species_occu.get(elem, 0) if occu > 0 and occu < 1.0: occupancy_dict[occu].append(i) occupancy_elements[occu] = elements_at_site else: # 不涉及Li的位点 # 判断是否涉及阴离子 if any(elem in self.target_anions for elem in elements_at_site): info.anion_has_partial_occupancy = True else: # 其他阳离子的共占位 → 需要扩胞 info.other_has_partial_occupancy = True # 获取占据率 for elem, occu in species_occu.items(): if occu > 0 and occu < 1.0: occupancy_dict[occu].append(i) occupancy_elements[occu] = elements_at_site break # 只记录一次 # 计算扩胞信息 self._calculate_expansion_info(info, occupancy_dict, occupancy_elements) def _evaluate_processability(self, info: StructureInfo): """评估可处理性(修正版)""" skip_reasons = [] if not info.is_valid: skip_reasons.append("无法解析CIF文件") if not info.contains_target_cation: skip_reasons.append(f"不含{self.target_cation}") if info.anion_mode == "none": skip_reasons.append("不含目标阴离子") if info.is_binary_compound: skip_reasons.append("二元化合物") if info.has_radioactive_elements: skip_reasons.append("含放射性元素") # Li与空位共占位 → 不需要处理(不加入skip_reasons) # info.cation_with_vacancy 不影响可处理性 # Li与其他阳离子共占位 → 需要扩胞(如果扩胞因子合理则可处理) if info.cation_with_other_cation: if info.expansion_info.can_expand: info.needs_expansion = True else: skip_reasons.append(f"{self.target_cation}与其他阳离子共占位且{info.expansion_info.skip_reason}") # 阴离子共占位 → 不处理 if info.anion_has_partial_occupancy: skip_reasons.append("阴离子存在共占位") if info.has_water_molecule: skip_reasons.append("含水分子") # 其他阳离子共占位(不涉及Li)→ 需要扩胞 if info.other_has_partial_occupancy: if info.expansion_info.can_expand: info.needs_expansion = True else: skip_reasons.append(info.expansion_info.skip_reason) if skip_reasons: info.can_process = False info.skip_reason = "; ".join(skip_reasons) else: info.can_process = True def _calculate_expansion_info( self, info: StructureInfo, occupancy_dict: Dict[float, List[int]], occupancy_elements: Dict[float, List[str]] ): """计算扩胞相关信息""" expansion_info = ExpansionInfo() if not occupancy_dict: info.expansion_info = expansion_info return # 需要扩胞(有非目标阳离子的共占位) expansion_info.needs_expansion = True expansion_info.problematic_sites = sum(len(v) for v in occupancy_dict.values()) # 转换为OccupancyInfo列表 occupancy_list = [] for occu, serials in occupancy_dict.items(): elements = occupancy_elements.get(occu, []) # 根据精度计算分数 limit = self.PRECISION_LIMITS.get(self.expansion_precision) if limit: fraction = Fraction(occu).limit_denominator(limit) else: fraction = Fraction(occu).limit_denominator() occ_info = OccupancyInfo( occupation=occu, atom_serials=[s + 1 for s in serials], # 转为1-based elements=elements, numerator=fraction.numerator, denominator=fraction.denominator, involves_target_cation=self.target_cation in elements, involves_anion=any(e in self.target_anions for e in elements) ) occupancy_list.append(occ_info) expansion_info.occupancy_details = occupancy_list # 计算最小公倍数(扩胞因子) denominators = [occ.denominator for occ in occupancy_list] if denominators: lcm = reduce(lambda a, b: a * b // math.gcd(a, b), denominators, 1) expansion_info.expansion_factor = lcm # 判断是否可以扩胞(因子过大则不可处理) if lcm > 64: # 扩胞超过64倍通常不可行 expansion_info.can_expand = False expansion_info.skip_reason = f"扩胞因子过大({lcm})" info.expansion_info = expansion_info info.needs_expansion = expansion_info.needs_expansion and expansion_info.can_expand def _check_water_molecule(self, structure: Structure) -> bool: """检查是否含有水分子""" try: oxygen_sites = [] hydrogen_sites = [] for site in structure.sites: species_str = str(site.species) if 'O' in species_str: oxygen_sites.append(site) if 'H' in species_str: hydrogen_sites.append(site) for o_site in oxygen_sites: nearby_h = [h for h in hydrogen_sites if o_site.distance(h) < 1.2] if len(nearby_h) >= 2: return True return False except: return False def _evaluate_processability(self, info: StructureInfo): """评估可处理性""" skip_reasons = [] if not info.is_valid: skip_reasons.append("无法解析CIF文件") if not info.contains_target_cation: skip_reasons.append(f"不含{self.target_cation}") if info.anion_mode == "none": skip_reasons.append("不含目标阴离子") if info.is_binary_compound: skip_reasons.append("二元化合物") if info.has_radioactive_elements: skip_reasons.append("含放射性元素") # 关键:目标阳离子共占位是不可处理的 if info.cation_has_partial_occupancy: skip_reasons.append(f"{self.target_cation}存在共占位") # 阴离子共占位通常也不处理 if info.anion_has_partial_occupancy: skip_reasons.append("阴离子存在共占位") if info.has_water_molecule: skip_reasons.append("含水分子") # 扩胞因子过大 if info.expansion_info.needs_expansion and not info.expansion_info.can_expand: skip_reasons.append(info.expansion_info.skip_reason) if skip_reasons: info.can_process = False info.skip_reason = "; ".join(skip_reasons) else: info.can_process = True