Files
screen/src/analysis/structure_inspector.py
2025-12-14 18:11:00 +08:00

486 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

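"""
Structure inspector: in-depth analysis of a single CIF file, including
the decision whether a supercell expansion is required.
"""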
from dataclasses import dataclass, field
from typing import Set, Dict, List, Optional, Tuple
from pymatgen.core import Structure
from pymatgen.core.periodic_table import Element, Specie
from collections import defaultdict
from fractions import Fraction
from functools import reduce
import math
import re
import os
@dataclass
class OccupancyInfo:
    """Describes one distinct fractional occupancy found in a structure.

    ``numerator`` / ``denominator`` hold the occupancy as a rational number;
    they default to 0/1 until a caller fills them in.
    """

    # Fractional occupancy value shared by the listed sites.
    occupation: float
    # Serial numbers of the sites carrying this occupancy.
    atom_serials: List[int] = field(default_factory=list)
    # Element symbols present on those sites.
    elements: List[str] = field(default_factory=list)
    # Rational representation of ``occupation``.
    numerator: int = 0
    denominator: int = 1
    # True when the target cation is one of ``elements``.
    involves_target_cation: bool = False
    # True when an anion is one of ``elements``.
    involves_anion: bool = False
@dataclass
class ExpansionInfo:
    """Summary of the supercell expansion required by partial occupancies."""

    # Whether any site requires a supercell expansion.
    needs_expansion: bool = False
    # Supercell multiplier (least common multiple of the denominators).
    expansion_factor: int = 1
    # One entry per distinct fractional occupancy.
    occupancy_details: List["OccupancyInfo"] = field(default_factory=list)
    # Number of sites that carry a problematic occupancy.
    problematic_sites: int = 0
    # False when the expansion is judged infeasible.
    can_expand: bool = True
    # Explanation of why the expansion cannot be done (empty otherwise).
    skip_reason: str = ""
@dataclass
class StructureInfo:
    """Analysis result for a single structure file."""

    file_path: str
    file_name: str

    # --- basic status ---
    is_valid: bool = False
    error_message: str = ""

    # --- composition ---
    elements: Set[str] = field(default_factory=set)
    num_sites: int = 0
    formula: str = ""

    # --- cation / anion information ---
    contains_target_cation: bool = False
    anion_types: Set[str] = field(default_factory=set)
    # One of "single", "mixed" or "none".
    anion_mode: str = ""

    # --- data-quality flags ---
    has_oxidation_states: bool = False
    # True when any site is partially occupied or shared.
    has_partial_occupancy: bool = False
    has_water_molecule: bool = False
    has_radioactive_elements: bool = False
    is_binary_compound: bool = False

    # --- detailed partial-occupancy classification ---
    # Target cation shares a site with a vacancy (needs no handling).
    cation_with_vacancy: bool = False
    # Target cation shares a site with another cation (needs expansion).
    cation_with_other_cation: bool = False
    # An anion takes part in a partially occupied site.
    anion_has_partial_occupancy: bool = False
    # Other elements are partially occupied (needs expansion).
    other_has_partial_occupancy: bool = False
    expansion_info: ExpansionInfo = field(default_factory=ExpansionInfo)

    # --- processability verdict ---
    needs_expansion: bool = False
    can_process: bool = False
    skip_reason: str = ""
class StructureInspector:
    """Analyse one CIF structure and decide whether it can be processed.

    The inspector loads the file with ``Structure.from_file``, collects the
    element set, classifies every partially occupied site (target cation
    with a vacancy, target cation mixed with another cation, anion
    disorder, other-cation disorder), derives the supercell multiplier
    needed to resolve the fractional occupancies and finally fills in
    ``can_process`` / ``skip_reason`` on the returned ``StructureInfo``.
    """

    # Anions used when the caller does not supply ``target_anions``.
    VALID_ANIONS = {'O', 'S', 'Cl', 'Br'}

    # Elements whose presence marks a structure as radioactive.
    RADIOACTIVE_ELEMENTS = {
        'U', 'Th', 'Pu', 'Ra', 'Rn', 'Po', 'Np', 'Am',
        'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'
    }

    # Maximum denominator used when turning an occupancy into a fraction.
    PRECISION_LIMITS = {
        'high': None,     # Fraction.limit_denominator() default bound
        'normal': 100,    # denominator <= 100
        'low': 10,        # denominator <= 10
        'very_low': 5     # denominator <= 5
    }

    # Largest supercell multiplier still regarded as feasible.
    MAX_EXPANSION_FACTOR = 64

    # Occupancies within this distance of 1.0 are treated as full, so that
    # summed floating-point occupancies (e.g. 0.333 + 0.667) are not
    # misclassified as partial.
    OCCUPANCY_TOLERANCE = 1e-6

    # An O site with at least two H sites closer than this is reported as
    # water. Same unit as ``site.distance`` (presumably angstrom - confirm).
    WATER_OH_CUTOFF = 1.2

    def __init__(
        self,
        target_cation: str = "Li",
        target_anions: Optional[Set[str]] = None,
        expansion_precision: str = "low"
    ):
        """
        Initialise the inspector.

        Args:
            target_cation: Symbol of the target cation (e.g. "Li", "Na").
            target_anions: Symbols of the target anions (e.g. {"O", "S"}).
                ``None`` or an empty set falls back to ``VALID_ANIONS``.
            expansion_precision: Key of ``PRECISION_LIMITS`` ('high',
                'normal', 'low', 'very_low'). An unknown key behaves like
                'high' because the lookup then yields ``None``.
        """
        self.target_cation = target_cation
        # Copy so that mutating the instance set can never alter the shared
        # class-level default (or the caller's own set).
        self.target_anions = set(target_anions or self.VALID_ANIONS)
        self.expansion_precision = expansion_precision
        # Spellings under which the target cation may appear.
        self.target_cation_variants = {
            target_cation,
            f"{target_cation}+",
            f"{target_cation}1+",
        }

    def inspect(self, file_path: str) -> "StructureInfo":
        """
        Analyse a single CIF file.

        Args:
            file_path: Path of the CIF file.

        Returns:
            StructureInfo: The analysis result. ``is_valid`` is False when
            the file cannot be read; when a later analysis step fails the
            structure stays valid and ``error_message`` describes the
            failure.
        """
        info = StructureInfo(
            file_path=file_path,
            file_name=os.path.basename(file_path)
        )

        try:
            structure = Structure.from_file(file_path)
        except Exception as e:
            info.is_valid = False
            info.error_message = f"读取CIF失败: {str(e)}"
            return info

        # The file could be parsed, so it counts as valid from here on.
        info.is_valid = True

        # Everything below is best effort: a failure must still return the
        # information gathered so far.
        try:
            # Composition keys may carry oxidation states ("Li+", "O2-");
            # reduce them to bare symbols so the membership tests below
            # compare like with like.
            symbols = (
                self._get_element_from_species_string(str(el))
                for el in structure.composition.elements
            )
            info.elements = {symbol for symbol in symbols if symbol}
            info.num_sites = structure.num_sites
            info.formula = structure.composition.reduced_formula

            # Counted on bare symbols so mixed valence does not inflate it.
            info.is_binary_compound = len(info.elements) == 2
            info.contains_target_cation = self.target_cation in info.elements

            info.anion_types = info.elements.intersection(self.target_anions)
            if not info.anion_types:
                info.anion_mode = "none"
            elif len(info.anion_types) == 1:
                info.anion_mode = "single"
            else:
                info.anion_mode = "mixed"

            info.has_oxidation_states = self._check_oxidation_states(structure)

            # Core analysis: partial / shared occupancy.
            try:
                self._analyze_partial_occupancy(structure, info)
            except Exception as e:
                # Record the failure instead of hiding it, then carry on.
                info.error_message = f"共占位分析失败: {e}"

            try:
                info.has_water_molecule = self._check_water_molecule(structure)
            except Exception:
                info.has_water_molecule = False

            info.has_radioactive_elements = bool(
                info.elements.intersection(self.RADIOACTIVE_ELEMENTS)
            )

            self._evaluate_processability(info)
        except Exception as e:
            # The file itself is valid; only the analysis went wrong.
            info.error_message = f"分析过程出错: {str(e)}"
        return info

    def _check_oxidation_states(self, structure: "Structure") -> bool:
        """Return True when any species on any site is a ``Specie``."""
        try:
            return any(
                isinstance(specie, Specie)
                for site in structure.sites
                for specie in site.species.keys()
            )
        except Exception:
            return False

    def _get_element_from_species_string(self, species_str: str) -> str:
        """Extract the leading element symbol from a species string.

        Returns an empty string when the text does not start with an
        upper-case letter.
        """
        match = re.match(r'([A-Z][a-z]?)', species_str)
        return match.group(1) if match else ""

    def _element_symbol(self, specie) -> str:
        """Return the bare element symbol of a species object or string."""
        raw = specie.symbol if hasattr(specie, 'symbol') else str(specie)
        return self._get_element_from_species_string(raw)

    def _site_element_occupancies(self, site) -> Dict[str, float]:
        """Map each element on ``site`` to its total occupancy.

        Species that differ only in oxidation state (e.g. Fe2+ / Fe3+) are
        summed rather than overwriting each other.
        """
        occupancies: Dict[str, float] = {}
        for specie, occu in site.species.items():
            elem = self._element_symbol(specie)
            if elem:
                occupancies[elem] = occupancies.get(elem, 0) + occu
        return occupancies

    def _get_occupancy_from_species_string(
        self, species_str: str, exclude_elements: Set[str]
    ) -> Optional[float]:
        """
        Return the occupancy of the first element not in ``exclude_elements``.

        ``species_str`` looks like ``"Li+:0.689, Sc3+:0.311"``. Parts
        without a colon, with an unrecognised element or with a
        non-numeric occupancy are skipped. Returns ``None`` when nothing
        qualifies.
        """
        if ':' not in species_str:
            return None
        for part in species_str.split(','):
            # partition() tolerates extra colons; the malformed remainder
            # then simply fails float() and the part is skipped.
            label, sep, occupancy_text = part.strip().partition(':')
            if not sep:
                continue
            element = self._get_element_from_species_string(label.strip())
            if not element or element in exclude_elements:
                continue
            try:
                return float(occupancy_text.strip())
            except ValueError:
                continue
        return None

    def _analyze_partial_occupancy(self, structure: "Structure", info: "StructureInfo"):
        """
        Classify every partially occupied or shared site and store the result.

        Rules:
            - target cation + vacancy         -> ``cation_with_vacancy``
              (needs no handling)
            - target cation + anions only     -> ``anion_has_partial_occupancy``
            - target cation + another cation  -> ``cation_with_other_cation``
              (needs expansion)
            - no target cation, anion present -> ``anion_has_partial_occupancy``
            - no target cation, no anion      -> ``other_has_partial_occupancy``
              (needs expansion)

        Occupancies that require an expansion are collected and handed to
        ``_calculate_expansion_info``.
        """
        occupancy_dict = defaultdict(list)  # {occupancy: [site indices]}
        occupancy_elements = {}             # {occupancy: [elements on site]}
        full = 1.0 - self.OCCUPANCY_TOLERANCE

        for i, site in enumerate(structure.sites):
            species_occu = self._site_element_occupancies(site)
            elements_at_site = list(species_occu)

            has_partial = (
                len(species_occu) > 1
                or any(occu < full for occu in species_occu.values())
            )
            if not has_partial:
                continue
            info.has_partial_occupancy = True

            if self.target_cation in species_occu:
                other_elements = [
                    e for e in elements_at_site if e != self.target_cation
                ]
                if not other_elements:
                    # Only the target cation sits here, below full occupancy.
                    if species_occu[self.target_cation] < full:
                        info.cation_with_vacancy = True
                    continue

                if all(e in self.target_anions for e in other_elements):
                    # Rare: target cation sharing a site with anions only.
                    info.anion_has_partial_occupancy = True
                    continue

                info.cation_with_other_cation = True
                # Record the occupancy of every non-anion partner element.
                for elem in other_elements:
                    if elem in self.target_anions:
                        continue
                    occu = species_occu[elem]
                    if 0 < occu < full:
                        occupancy_dict[occu].append(i)
                        occupancy_elements[occu] = elements_at_site
                continue

            # Site without the target cation.
            if any(elem in self.target_anions for elem in elements_at_site):
                info.anion_has_partial_occupancy = True
                continue

            info.other_has_partial_occupancy = True
            for occu in species_occu.values():
                if 0 < occu < full:
                    occupancy_dict[occu].append(i)
                    occupancy_elements[occu] = elements_at_site
                    break  # one entry per site is enough

        self._calculate_expansion_info(info, occupancy_dict, occupancy_elements)

    def _calculate_expansion_info(
        self,
        info: "StructureInfo",
        occupancy_dict: Dict[float, List[int]],
        occupancy_elements: Dict[float, List[str]]
    ):
        """
        Derive the supercell multiplier from the collected occupancies.

        Each occupancy is turned into a fraction whose denominator is
        bounded by the configured precision; the expansion factor is the
        least common multiple of those denominators. A factor above
        ``MAX_EXPANSION_FACTOR`` marks the expansion as infeasible. The
        result is stored in ``info.expansion_info``, and
        ``info.needs_expansion`` is updated when there is something to
        expand.
        """
        expansion_info = ExpansionInfo()
        if not occupancy_dict:
            info.expansion_info = expansion_info
            return

        expansion_info.needs_expansion = True
        expansion_info.problematic_sites = sum(
            len(serials) for serials in occupancy_dict.values()
        )

        limit = self.PRECISION_LIMITS.get(self.expansion_precision)
        occupancy_list = []
        for occu, serials in occupancy_dict.items():
            elements = occupancy_elements.get(occu, [])
            # NOTE(review): with a small denominator bound an occupancy
            # close to 0 or 1 rounds to 0/1 or 1/1 - confirm that this is
            # acceptable for the expansion step.
            if limit:
                fraction = Fraction(occu).limit_denominator(limit)
            else:
                fraction = Fraction(occu).limit_denominator()
            occupancy_list.append(OccupancyInfo(
                occupation=occu,
                atom_serials=[s + 1 for s in serials],  # 1-based serials
                elements=elements,
                numerator=fraction.numerator,
                denominator=fraction.denominator,
                involves_target_cation=self.target_cation in elements,
                involves_anion=any(e in self.target_anions for e in elements)
            ))
        expansion_info.occupancy_details = occupancy_list

        # Least common multiple of all denominators = expansion factor.
        lcm = reduce(
            lambda a, b: a * b // math.gcd(a, b),
            (occ.denominator for occ in occupancy_list),
            1
        )
        expansion_info.expansion_factor = lcm
        if lcm > self.MAX_EXPANSION_FACTOR:
            expansion_info.can_expand = False
            expansion_info.skip_reason = f"扩胞因子过大({lcm})"

        info.expansion_info = expansion_info
        info.needs_expansion = expansion_info.needs_expansion and expansion_info.can_expand

    def _check_water_molecule(self, structure: "Structure") -> bool:
        """Return True when some O site has two or more H sites nearby.

        Sites are selected by element symbol rather than by substring, so
        Os or Hf/Hg/Ho/He are not mistaken for O or H. Any error during
        the check yields False.
        """
        try:
            oxygen_sites = []
            hydrogen_sites = []
            for site in structure.sites:
                symbols = {
                    self._element_symbol(specie)
                    for specie in site.species.keys()
                }
                if 'O' in symbols:
                    oxygen_sites.append(site)
                if 'H' in symbols:
                    hydrogen_sites.append(site)

            for o_site in oxygen_sites:
                nearby_h = sum(
                    1 for h in hydrogen_sites
                    if o_site.distance(h) < self.WATER_OH_CUTOFF
                )
                if nearby_h >= 2:
                    return True
            return False
        except Exception:
            return False

    def _evaluate_processability(self, info: "StructureInfo"):
        """
        Decide whether the structure can be processed and record why not.

        Sets ``info.can_process`` and, when anything blocks processing,
        joins all reasons into ``info.skip_reason``. Also raises
        ``info.needs_expansion`` when cation disorder can be resolved by a
        feasible supercell expansion. The target cation sharing a site
        with a vacancy never blocks processing.
        """
        skip_reasons = []

        if not info.is_valid:
            skip_reasons.append("无法解析CIF文件")
        if not info.contains_target_cation:
            skip_reasons.append(f"不含{self.target_cation}")
        if info.anion_mode == "none":
            skip_reasons.append("不含目标阴离子")
        if info.is_binary_compound:
            skip_reasons.append("二元化合物")
        if info.has_radioactive_elements:
            skip_reasons.append("含放射性元素")

        # Target cation mixed with another cation: fine when the expansion
        # factor is feasible, otherwise a reason to skip.
        if info.cation_with_other_cation:
            if info.expansion_info.can_expand:
                info.needs_expansion = True
            else:
                skip_reasons.append(
                    f"{self.target_cation}与其他阳离子共占位且{info.expansion_info.skip_reason}"
                )

        # Anion disorder is not handled.
        if info.anion_has_partial_occupancy:
            skip_reasons.append("阴离子存在共占位")
        if info.has_water_molecule:
            skip_reasons.append("含水分子")

        # Disorder among other cations (no target cation on the site).
        if info.other_has_partial_occupancy:
            if info.expansion_info.can_expand:
                info.needs_expansion = True
            else:
                skip_reasons.append(info.expansion_info.skip_reason)

        if skip_reasons:
            info.can_process = False
            info.skip_reason = "; ".join(skip_reasons)
        else:
            info.can_process = True