486 lines
18 KiB
Python
486 lines
18 KiB
Python
"""
Structure inspector: in-depth analysis of a single CIF file
(including the check for whether supercell expansion is needed).
"""
|
||
from dataclasses import dataclass, field
|
||
from typing import Set, Dict, List, Optional, Tuple
|
||
from pymatgen.core import Structure
|
||
from pymatgen.core.periodic_table import Element, Specie
|
||
from collections import defaultdict
|
||
from fractions import Fraction
|
||
from functools import reduce
|
||
import math
|
||
import re
|
||
import os
|
||
|
||
|
||
@dataclass
class OccupancyInfo:
    """Description of one group of sites that share the same partial occupancy.

    Only ``occupation`` is required; every other field has a default so the
    record can be filled in incrementally.
    """

    # Occupancy value shared by the sites in this group.
    occupation: float

    # Serial numbers of the atoms (sites) in this group.
    atom_serials: List[int] = field(default_factory=list)

    # Element symbols involved in this group.
    elements: List[str] = field(default_factory=list)

    # Numerator of the occupancy expressed as a fraction.
    numerator: int = 0

    # Denominator of the occupancy expressed as a fraction.
    denominator: int = 1

    # Whether the target cation is one of the elements involved.
    involves_target_cation: bool = False

    # Whether an anion is one of the elements involved.
    involves_anion: bool = False
|
||
|
||
|
||
@dataclass
class ExpansionInfo:
    """Supercell-expansion summary for one structure.

    All fields have defaults, so ``ExpansionInfo()`` describes a structure
    that needs no expansion.
    """

    # Whether the structure needs a supercell expansion.
    needs_expansion: bool = False

    # Expansion factor (least common multiple of the occupancy denominators).
    expansion_factor: int = 1

    # Per-occupancy details of the shared/partially occupied sites.
    occupancy_details: List[OccupancyInfo] = field(default_factory=list)

    # Number of problematic sites.
    problematic_sites: int = 0

    # Whether the expansion can actually be carried out.
    can_expand: bool = True

    # Reason why the expansion cannot be carried out (empty when it can).
    skip_reason: str = ""
|
||
|
||
|
||
@dataclass
class StructureInfo:
    """Analysis result for a single structure file.

    ``file_path`` and ``file_name`` are required; every other field starts at
    a neutral default and is filled in by the inspector.
    """

    file_path: str
    file_name: str

    # --- Basic status -------------------------------------------------------
    is_valid: bool = False
    error_message: str = ""

    # --- Elemental composition ----------------------------------------------
    elements: Set[str] = field(default_factory=set)
    num_sites: int = 0
    formula: str = ""

    # --- Cation / anion information -----------------------------------------
    contains_target_cation: bool = False
    anion_types: Set[str] = field(default_factory=set)
    # One of "single", "mixed", "none".
    anion_mode: str = ""

    # --- Data-quality flags -------------------------------------------------
    has_oxidation_states: bool = False
    # Whether any site is shared or partially occupied.
    has_partial_occupancy: bool = False
    has_water_molecule: bool = False
    has_radioactive_elements: bool = False
    is_binary_compound: bool = False

    # --- Detailed partial-occupancy analysis --------------------------------
    # Target cation shares a site with a vacancy (needs no handling).
    cation_with_vacancy: bool = False
    # Target cation shares a site with another cation (needs expansion).
    cation_with_other_cation: bool = False
    # An anion is involved in a shared/partially occupied site.
    anion_has_partial_occupancy: bool = False
    # Other elements share a site (needs expansion).
    other_has_partial_occupancy: bool = False
    expansion_info: ExpansionInfo = field(default_factory=ExpansionInfo)

    # --- Processability -----------------------------------------------------
    needs_expansion: bool = False
    can_process: bool = False
    skip_reason: str = ""
|
||
|
||
|
||
class StructureInspector:
    """Structure inspector with supercell-expansion analysis.

    ``inspect`` reads one CIF file, collects composition and data-quality
    flags into a ``StructureInfo``, analyses shared/partially occupied sites,
    and finally decides whether the structure can be processed.
    """

    # Anion symbols used when the caller does not supply ``target_anions``.
    VALID_ANIONS = {'O', 'S', 'Cl', 'Br'}

    # Elements treated as radioactive; structures containing them are skipped.
    RADIOACTIVE_ELEMENTS = {
        'U', 'Th', 'Pu', 'Ra', 'Rn', 'Po', 'Np', 'Am',
        'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'
    }

    # Maximum denominator allowed when an occupancy is turned into a fraction.
    PRECISION_LIMITS = {
        'high': None,      # Fraction's own default denominator limit
        'normal': 100,     # denominator <= 100
        'low': 10,         # denominator <= 10
        'very_low': 5      # denominator <= 5
    }

    # Expansion factors above this value are reported as not expandable.
    MAX_EXPANSION_FACTOR = 64

    # An O site with at least two H sites closer than this counts as water.
    # NOTE(review): presumably in angstrom (pymatgen site distances) - confirm.
    WATER_OH_CUTOFF = 1.2

    def __init__(
        self,
        target_cation: str = "Li",
        target_anions: Optional[Set[str]] = None,
        expansion_precision: str = "low"
    ):
        """Initialise the inspector.

        Args:
            target_cation: Target cation symbol (e.g. "Li", "Na").
            target_anions: Target anion symbols (e.g. {"O", "S"}). ``None``
                or an empty collection falls back to ``VALID_ANIONS``.
            expansion_precision: Key of ``PRECISION_LIMITS``
                ('high', 'normal', 'low', 'very_low'). An unknown key is
                treated like 'high' because the lookup then yields ``None``.
        """
        self.target_cation = target_cation
        # Copy so that the class-level default set is never shared/mutated.
        self.target_anions = set(target_anions or self.VALID_ANIONS)
        self.expansion_precision = expansion_precision

        # Spellings under which the target cation may appear.
        self.target_cation_variants = {
            target_cation,
            f"{target_cation}+",
            f"{target_cation}1+",
        }

    def inspect(self, file_path: str) -> StructureInfo:
        """Analyse a single CIF file.

        Args:
            file_path: Path of the CIF file.

        Returns:
            StructureInfo: The analysis result. If the file cannot be read,
            ``is_valid`` is False and ``error_message`` explains why. If a
            later analysis step fails, ``is_valid`` stays True and
            ``error_message`` records the failure.
        """
        info = StructureInfo(
            file_path=file_path,
            file_name=os.path.basename(file_path)
        )

        # Reading the file is the only step that invalidates the result.
        try:
            structure = Structure.from_file(file_path)
        except Exception as e:
            info.is_valid = False
            info.error_message = f"读取CIF失败: {str(e)}"
            return info

        info.is_valid = True

        # Everything below is wrapped so that basic info is still returned
        # when a later analysis step raises.
        try:
            composition = structure.composition

            # Normalise to bare element symbols: for structures that carry
            # oxidation states the composition keys stringify as e.g. "Li+",
            # which would never match the target cation / anion symbols.
            symbols = (self._symbol_of(sp) for sp in composition.elements)
            info.elements = {sym for sym in symbols if sym}
            info.num_sites = structure.num_sites
            info.formula = composition.reduced_formula

            # Binary compound: exactly two distinct elements.
            info.is_binary_compound = len(info.elements) == 2

            info.contains_target_cation = self.target_cation in info.elements

            # Classify by how many of the target anions are present.
            info.anion_types = info.elements.intersection(self.target_anions)
            anion_count = len(info.anion_types)
            if anion_count == 0:
                info.anion_mode = "none"
            elif anion_count == 1:
                info.anion_mode = "single"
            else:
                info.anion_mode = "mixed"

            info.has_oxidation_states = self._check_oxidation_states(structure)

            # Core analysis of shared/partially occupied sites. A failure
            # here is best-effort: record it and carry on with the rest.
            try:
                self._analyze_partial_occupancy(structure, info)
            except Exception as e:
                info.error_message = f"共占位分析失败: {str(e)}"

            # _check_water_molecule never raises; it returns False on error.
            info.has_water_molecule = self._check_water_molecule(structure)

            info.has_radioactive_elements = bool(
                info.elements.intersection(self.RADIOACTIVE_ELEMENTS)
            )

            self._evaluate_processability(info)

        except Exception as e:
            # The file itself is valid; keep is_valid = True and record why
            # the analysis stopped.
            info.error_message = f"分析过程出错: {str(e)}"

        return info

    def _check_oxidation_states(self, structure: Structure) -> bool:
        """Return True if any species on any site is a ``Specie`` instance."""
        try:
            return any(
                isinstance(specie, Specie)
                for site in structure.sites
                for specie in site.species.keys()
            )
        except Exception:
            return False

    def _get_element_from_species_string(self, species_str: str) -> str:
        """Extract the leading element symbol from a species string.

        Returns "" when the string does not start with an upper-case letter.
        """
        match = re.match(r'([A-Z][a-z]?)', species_str)
        return match.group(1) if match else ""

    def _symbol_of(self, species) -> str:
        """Return the bare element symbol of a species object or string."""
        raw = species.symbol if hasattr(species, 'symbol') else str(species)
        return StructureInspector._get_element_from_species_string(self, raw)

    def _site_occupancies(self, site) -> Dict[str, float]:
        """Map element symbol -> occupancy for one site.

        If two species on the site reduce to the same symbol, the later
        one overwrites the earlier one.
        """
        occupancies = {}
        for sp, occu in site.species.items():
            symbol = StructureInspector._symbol_of(self, sp)
            if symbol:
                occupancies[symbol] = occu
        return occupancies

    def _get_occupancy_from_species_string(self, species_str: str, exclude_elements: Set[str]) -> Optional[float]:
        """Return the occupancy of the first element not in ``exclude_elements``.

        The expected format is e.g. "Li+:0.689, Sc3+:0.311". Returns ``None``
        when the string has no ':' or no usable entry is found.
        """
        if ':' not in species_str:
            return None

        for part in (p.strip() for p in species_str.split(',')):
            # partition never raises, even if the part holds several colons.
            element_part, sep, occu_part = part.partition(':')
            if not sep:
                continue
            element = StructureInspector._get_element_from_species_string(
                self, element_part.strip()
            )
            if not element or element in exclude_elements:
                continue
            try:
                return float(occu_part.strip())
            except ValueError:
                continue
        return None

    def _analyze_partial_occupancy(self, structure: Structure, info: StructureInfo):
        """Classify every shared/partially occupied site and fill ``info``.

        Rules:
        - target cation with a vacancy -> no handling (cation_with_vacancy)
        - target cation with another cation -> needs expansion
          (cation_with_other_cation)
        - sites involving an anion -> flagged, not expanded
          (anion_has_partial_occupancy)
        - other cations sharing a site -> needs expansion
          (other_has_partial_occupancy)

        The occupancies that require expansion are then handed to
        ``_calculate_expansion_info``.
        """
        occupancy_dict = defaultdict(list)   # {occupancy: [site indices]}
        # {occupancy: elements}. NOTE(review): when several sites share an
        # occupancy value, the elements of the last such site are kept.
        occupancy_elements = {}

        def record(occu, index, elements):
            # Avoid listing the same site twice under one occupancy value
            # (happens when two elements on a site have equal occupancy).
            serials = occupancy_dict[occu]
            if not serials or serials[-1] != index:
                serials.append(index)
            occupancy_elements[occu] = elements

        for i, site in enumerate(structure.sites):
            species_occu = self._site_occupancies(site)
            elements_at_site = list(species_occu)

            # Partial: more than one element, or any occupancy below 1.
            has_partial = (
                len(species_occu) > 1
                or any(occu < 1.0 for occu in species_occu.values())
            )
            if not has_partial:
                continue

            info.has_partial_occupancy = True

            if self.target_cation in species_occu:
                li_occu = species_occu.get(self.target_cation, 0)
                other_elements = [e for e in elements_at_site if e != self.target_cation]

                if not other_elements and li_occu < 1.0:
                    # Target cation alone with occupancy < 1: vacancy sharing.
                    info.cation_with_vacancy = True
                elif other_elements:
                    if all(e in self.target_anions for e in other_elements):
                        # Target cation sharing with anions only (rare).
                        info.anion_has_partial_occupancy = True
                    else:
                        # Target cation sharing with another cation.
                        info.cation_with_other_cation = True

                    # Record the occupancies of the non-anion partners.
                    for elem in other_elements:
                        if elem in self.target_anions:
                            continue
                        occu = species_occu.get(elem, 0)
                        if 0 < occu < 1.0:
                            record(occu, i, elements_at_site)
            elif any(elem in self.target_anions for elem in elements_at_site):
                # Site without the target cation but involving an anion.
                info.anion_has_partial_occupancy = True
            else:
                # Other cations sharing a site: needs expansion.
                info.other_has_partial_occupancy = True
                # Only the first fractional occupancy of the site is recorded.
                for occu in species_occu.values():
                    if 0 < occu < 1.0:
                        record(occu, i, elements_at_site)
                        break

        self._calculate_expansion_info(info, occupancy_dict, occupancy_elements)

    def _evaluate_processability(self, info: StructureInfo):
        """Decide whether the structure can be processed.

        Sets ``info.can_process`` and ``info.skip_reason`` (all reasons joined
        with "; "), and sets ``info.needs_expansion`` when a cation-sharing
        site can be resolved by expansion. Vacancy sharing of the target
        cation does not affect processability.
        """
        skip_reasons = []

        if not info.is_valid:
            skip_reasons.append("无法解析CIF文件")

        if not info.contains_target_cation:
            skip_reasons.append(f"不含{self.target_cation}")

        if info.anion_mode == "none":
            skip_reasons.append("不含目标阴离子")

        if info.is_binary_compound:
            skip_reasons.append("二元化合物")

        if info.has_radioactive_elements:
            skip_reasons.append("含放射性元素")

        # Target cation sharing with another cation: processable only if the
        # expansion is feasible.
        if info.cation_with_other_cation:
            if info.expansion_info.can_expand:
                info.needs_expansion = True
            else:
                skip_reasons.append(f"{self.target_cation}与其他阳离子共占位且{info.expansion_info.skip_reason}")

        # Anion sharing is not handled.
        if info.anion_has_partial_occupancy:
            skip_reasons.append("阴离子存在共占位")

        if info.has_water_molecule:
            skip_reasons.append("含水分子")

        # Other cations sharing a site (no target cation): needs expansion.
        if info.other_has_partial_occupancy:
            if info.expansion_info.can_expand:
                info.needs_expansion = True
            else:
                skip_reasons.append(info.expansion_info.skip_reason)

        if skip_reasons:
            info.can_process = False
            info.skip_reason = "; ".join(skip_reasons)
        else:
            info.can_process = True

    def _calculate_expansion_info(
        self,
        info: StructureInfo,
        occupancy_dict: Dict[float, List[int]],
        occupancy_elements: Dict[float, List[str]]
    ):
        """Build ``info.expansion_info`` from the recorded occupancies.

        Args:
            info: Result object to update (``expansion_info``,
                ``needs_expansion``).
            occupancy_dict: Occupancy value -> 0-based site indices.
            occupancy_elements: Occupancy value -> element symbols.
        """
        expansion_info = ExpansionInfo()

        if not occupancy_dict:
            info.expansion_info = expansion_info
            return

        expansion_info.needs_expansion = True
        # Count each site once even if it appears under several occupancies.
        expansion_info.problematic_sites = len(
            {idx for serials in occupancy_dict.values() for idx in serials}
        )

        # Denominator cap for the configured precision (None -> Fraction default).
        limit = self.PRECISION_LIMITS.get(self.expansion_precision)

        occupancy_list = []
        for occu, serials in occupancy_dict.items():
            elements = occupancy_elements.get(occu, [])

            if limit:
                fraction = Fraction(occu).limit_denominator(limit)
            else:
                fraction = Fraction(occu).limit_denominator()

            occupancy_list.append(OccupancyInfo(
                occupation=occu,
                atom_serials=[s + 1 for s in serials],  # convert to 1-based
                elements=elements,
                numerator=fraction.numerator,
                denominator=fraction.denominator,
                involves_target_cation=self.target_cation in elements,
                involves_anion=any(e in self.target_anions for e in elements)
            ))

        expansion_info.occupancy_details = occupancy_list

        # Expansion factor = least common multiple of all denominators.
        denominators = [occ.denominator for occ in occupancy_list]
        if denominators:
            lcm = reduce(lambda a, b: a * b // math.gcd(a, b), denominators, 1)
            expansion_info.expansion_factor = lcm

            # Too large a factor is reported as not expandable.
            if lcm > self.MAX_EXPANSION_FACTOR:
                expansion_info.can_expand = False
                expansion_info.skip_reason = f"扩胞因子过大({lcm})"

        info.expansion_info = expansion_info
        info.needs_expansion = expansion_info.needs_expansion and expansion_info.can_expand

    def _check_water_molecule(self, structure: Structure) -> bool:
        """Return True if some O site has at least two H sites within the cutoff.

        Elements are matched by exact symbol, so Hf/Hg/Ho are not mistaken
        for hydrogen nor Os for oxygen. Any error yields False (best effort).
        """
        try:
            oxygen_sites = []
            hydrogen_sites = []

            for site in structure.sites:
                symbols = StructureInspector._site_occupancies(self, site)
                if 'O' in symbols:
                    oxygen_sites.append(site)
                if 'H' in symbols:
                    hydrogen_sites.append(site)

            for o_site in oxygen_sites:
                nearby_h = sum(
                    1 for h in hydrogen_sites
                    if o_site.distance(h) < self.WATER_OH_CUTOFF
                )
                if nearby_h >= 2:
                    return True
            return False
        except Exception:
            return False