增加扩胞逻辑

This commit is contained in:
2025-12-14 16:52:14 +08:00
parent f27fd3e3ce
commit 72cf0a79e1
2 changed files with 253 additions and 46 deletions

View File

@@ -1,10 +1,39 @@
"""
结构检查器对单个CIF文件进行深度分析
结构检查器对单个CIF文件进行深度分析(含扩胞需求判断)
"""
from dataclasses import dataclass, field
from typing import Set, Dict, List, Optional, Tuple
from pymatgen.core import Structure
from pymatgen.core.periodic_table import Element, Specie
from collections import defaultdict
from fractions import Fraction
from functools import reduce
import math
import re
import os
@dataclass
class OccupancyInfo:
    """Co-occupancy (shared-site occupancy) information.

    Attributes:
        occupation: Occupancy value.
        atom_serials: Serial numbers of the atoms concerned.
        elements: Element symbols involved.
        numerator: Numerator of the occupancy expressed as a fraction.
        denominator: Denominator of the occupancy expressed as a fraction.
        involves_target_cation: Whether the target cation is involved.
        involves_anion: Whether an anion is involved.
    """

    occupation: float
    atom_serials: List[int] = field(default_factory=list)
    elements: List[str] = field(default_factory=list)
    numerator: int = 0
    denominator: int = 1
    involves_target_cation: bool = False
    involves_anion: bool = False
@dataclass
class ExpansionInfo:
    """Supercell-expansion information.

    Attributes:
        needs_expansion: Whether supercell expansion is required.
        expansion_factor: Expansion factor (a least common multiple).
        occupancy_details: Per-occupancy co-occupancy details.
        problematic_sites: Number of problematic sites.
        can_expand: Whether the structure can be handled by expansion.
        skip_reason: Why expansion is not possible, if it is not.
    """

    needs_expansion: bool = False
    expansion_factor: int = 1
    occupancy_details: List[OccupancyInfo] = field(default_factory=list)
    problematic_sites: int = 0
    can_expand: bool = True
    skip_reason: str = ""
@dataclass
@@ -20,29 +49,34 @@ class StructureInfo:
# 元素组成
elements: Set[str] = field(default_factory=set)
num_sites: int = 0
formula: str = ""
# 阳离子/阴离子信息
contains_target_cation: bool = False
anion_types: Set[str] = field(default_factory=set) # 找到的目标阴离子
anion_types: Set[str] = field(default_factory=set)
anion_mode: str = "" # "single", "mixed", "none"
# 数据质量标记
has_oxidation_states: bool = False
has_partial_occupancy: bool = False # 是否有共占位
cation_has_partial_occupancy: bool = False # 目标阳离子是否共占位
anion_has_partial_occupancy: bool = False # 阴离子是否共占位
has_water_molecule: bool = False
has_radioactive_elements: bool = False
is_binary_compound: bool = False
# 共占位详细分析(新增)
cation_has_partial_occupancy: bool = False # 目标阳离子共占位
anion_has_partial_occupancy: bool = False # 阴离子共占位
other_has_partial_occupancy: bool = False # 其他元素共占位(需扩胞)
expansion_info: ExpansionInfo = field(default_factory=ExpansionInfo)
# 可处理性
needs_expansion: bool = False # 需要扩胞
can_process: bool = False # 可以直接处理
skip_reason: str = "" # 跳过原因
needs_expansion: bool = False
can_process: bool = False
skip_reason: str = ""
class StructureInspector:
"""结构检查器"""
"""结构检查器(含扩胞分析)"""
# 预定义的阴离子集合
VALID_ANIONS = {'O', 'S', 'Cl', 'Br'}
@@ -53,16 +87,38 @@ class StructureInspector:
'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'
}
def __init__(self, target_cation: str = "Li", target_anions: Set[str] = None):
# Supercell-expansion precision modes (mode name -> denominator limit)
PRECISION_LIMITS = {
    'high': None, # exact fraction
    'normal': 100, # denominator <= 100
    'low': 10, # denominator <= 10
    'very_low': 5 # denominator <= 5
}
def __init__(
    self,
    target_cation: str = "Li",
    target_anions: Optional[Set[str]] = None,
    expansion_precision: str = "low",
):
    """
    Initialize the inspector.

    Args:
        target_cation: Target cation (e.g. "Li", "Na").
        target_anions: Target anion set (e.g. {"O", "S"}). When omitted or
            empty, a copy of ``VALID_ANIONS`` is used.
        expansion_precision: Precision mode for the supercell-expansion
            calculation ('high', 'normal', 'low', 'very_low').
    """
    self.target_cation = target_cation
    # Store a private copy: keeping a reference to the class-level
    # VALID_ANIONS (or to the caller's set) would let an in-place change on
    # one inspector leak into the shared constant and every other instance.
    self.target_anions = set(target_anions) if target_anions else set(self.VALID_ANIONS)
    self.expansion_precision = expansion_precision
    # Possible textual forms of the target cation
    self.target_cation_variants = {
        target_cation,
        f"{target_cation}+",
        f"{target_cation}1+",
    }
def inspect(self, file_path: str) -> StructureInfo:
"""
@@ -74,7 +130,6 @@ class StructureInspector:
Returns:
StructureInfo: 分析结果
"""
import os
info = StructureInfo(
file_path=file_path,
file_name=os.path.basename(file_path)
@@ -91,9 +146,10 @@ class StructureInspector:
# 基础信息
info.elements = {str(el) for el in structure.composition.elements}
info.num_sites = structure.num_sites
info.formula = structure.composition.reduced_formula
# 检查是否为二元化合物
info.is_binary_compound = len(structure.types_of_specie) == 2
info.is_binary_compound = len(structure.composition.elements) == 2
# 检查是否含有目标阳离子
info.contains_target_cation = self.target_cation in info.elements
@@ -110,8 +166,8 @@ class StructureInspector:
# 检查氧化态
info.has_oxidation_states = self._check_oxidation_states(structure)
# 检查共占位
self._check_partial_occupancy(structure, info)
# 检查共占位(核心分析)
self._analyze_partial_occupancy(structure, info)
# 检查水分子
info.has_water_molecule = self._check_water_molecule(structure)
@@ -121,9 +177,6 @@ class StructureInspector:
info.elements.intersection(self.RADIOACTIVE_ELEMENTS)
)
# 判断是否需要扩胞
info.needs_expansion = info.has_partial_occupancy and not info.cation_has_partial_occupancy
# 判断可处理性
self._evaluate_processability(info)
@@ -140,48 +193,172 @@ class StructureInspector:
except:
return False
def _check_partial_occupancy(self, structure: Structure, info: StructureInfo):
"""检查共占位情况"""
try:
for site in structure.sites:
if len(site.species) > 1:
def _get_element_from_species_string(self, species_str: str) -> str:
    """Extract the bare element symbol from a species label.

    The symbol is one uppercase letter optionally followed by one lowercase
    letter, taken from the start of the label (e.g. "Sc3+" -> "Sc").
    Leading whitespace is skipped so labels cut out of a comma-separated
    list do not need to be stripped first.

    Args:
        species_str: Species label such as "Li+", "Sc3+" or "O".

    Returns:
        The element symbol, or "" when the label is empty/None or does not
        start with an element-like token.
    """
    if not species_str:
        return ""
    match = re.match(r'\s*([A-Z][a-z]?)', species_str)
    return match.group(1) if match else ""
def _get_occupancy_from_species_string(self, species_str: str, exclude_elements: Set[str]) -> Optional[float]:
    """
    Return the occupancy of the first non-excluded element in a species string.

    Expected format: "Li+:0.689, Sc3+:0.311" (comma-separated
    "species:occupancy" entries).

    Args:
        species_str: Text describing the species on one site.
        exclude_elements: Element symbols whose entries are skipped.

    Returns:
        The occupancy of the first entry whose element is not excluded and
        whose value parses as a float; None when there is no such entry.
    """
    for part in species_str.split(','):
        # Split on the LAST colon only: unpacking a plain split(':') raised
        # ValueError for any entry that contained more than one colon.
        label, sep, value = part.rpartition(':')
        if not sep:
            continue
        element = self._get_element_from_species_string(label.strip())
        if not element or element in exclude_elements:
            continue
        try:
            return float(value.strip())
        except ValueError:
            # Unparseable occupancy: keep looking at the remaining entries.
            continue
    return None
def _analyze_partial_occupancy(self, structure: Structure, info: StructureInfo):
"""
Analyze partial/co-occupancy (core logic).
Key rules:
- Partial occupancy on the target cation (Li) → cannot be processed
- Partial occupancy on an anion → needs supercell expansion, but is usually not handled
- Partial occupancy on other cations → handled by supercell expansion
"""
occupancy_dict = defaultdict(list) # {occupation: [site_indices]}
occupancy_elements = {} # {occupation: [elements]}
for i, site in enumerate(structure.sites):
site_species = site.species
# NOTE(review): _get_occupancy_from_species_string documents its input as
# "Li+:0.689, Sc3+:0.311"; confirm str(site.species) really produces that
# colon/comma form (pymatgen's Site.species_string does) — if it does not,
# occu below is always None and co-occupied sites are never recorded.
species_string = str(site.species)
# Check whether several species occupy the same site
if len(site_species) > 1:
info.has_partial_occupancy = True
# Collect the element symbol of each species
elements_at_site = []
for sp in site_species.keys():
elem = sp.symbol if hasattr(sp, 'symbol') else str(sp)
elem = self._get_element_from_species_string(elem)
if elem:
elements_at_site.append(elem)
# Does the site involve the target cation?
if self.target_cation in elements_at_site:
info.cation_has_partial_occupancy = True
# Does the site involve an anion?
if any(elem in self.target_anions for elem in elements_at_site):
info.anion_has_partial_occupancy = True
# Does the site involve other elements (the case handled by supercell expansion)?
other_elements = [e for e in elements_at_site
if e != self.target_cation and e not in self.target_anions]
if other_elements:
info.other_has_partial_occupancy = True
# Get the occupancy (that of the non-target-cation species)
occu = self._get_occupancy_from_species_string(
species_string,
self.target_cation_variants
)
if occu is not None and occu != 1.0:
occupancy_dict[occu].append(i)
occupancy_elements[occu] = elements_at_site
# Check partial occupancy of a single species
for specie, occupancy in site_species.items():
if occupancy < 1.0:
info.has_partial_occupancy = True
elem = specie.symbol if hasattr(specie, 'symbol') else str(specie)
elem = self._get_element_from_species_string(elem)
# Check whether the target cation is involved
species_symbols = [str(sp.symbol) if hasattr(sp, 'symbol') else str(sp)
for sp in site.species.keys()]
if self.target_cation in species_symbols:
if elem == self.target_cation:
info.cation_has_partial_occupancy = True
# Check whether an anion is involved
if any(sym in self.target_anions for sym in species_symbols):
elif elem in self.target_anions:
info.anion_has_partial_occupancy = True
else:
info.other_has_partial_occupancy = True
occupancy_dict[occupancy].append(i)
occupancy_elements[occupancy] = [elem]
# Check partial occupancy of a single species
for specie, occupancy in site.species.items():
if occupancy < 1.0:
info.has_partial_occupancy = True
symbol = str(specie.symbol) if hasattr(specie, 'symbol') else str(specie)
# Compute the supercell-expansion information
self._calculate_expansion_info(info, occupancy_dict, occupancy_elements)
if symbol == self.target_cation:
info.cation_has_partial_occupancy = True
if symbol in self.target_anions:
info.anion_has_partial_occupancy = True
except Exception as e:
pass
def _calculate_expansion_info(
    self,
    info: StructureInfo,
    occupancy_dict: Dict[float, List[int]],
    occupancy_elements: Dict[float, List[str]]
):
    """Fill ``info.expansion_info`` and ``info.needs_expansion``.

    Args:
        info: Result object that receives the expansion summary.
        occupancy_dict: Maps an occupancy value to the 0-based site indices
            recorded for it.
        occupancy_elements: Maps an occupancy value to the element symbols
            recorded for it.
    """
    summary = ExpansionInfo()

    # Nothing recorded: store the default (no-expansion) summary and stop.
    if not occupancy_dict:
        info.expansion_info = summary
        return

    # Some occupancy was recorded, so expansion is required.
    summary.needs_expansion = True
    summary.problematic_sites = sum(map(len, occupancy_dict.values()))

    # Denominator cap for the configured precision mode; a falsy value
    # means the default cap of Fraction.limit_denominator() is used.
    max_denominator = self.PRECISION_LIMITS.get(self.expansion_precision)

    details = []
    for occupation, site_indices in occupancy_dict.items():
        site_elements = occupancy_elements.get(occupation, [])

        exact = Fraction(occupation)
        if max_denominator:
            approx = exact.limit_denominator(max_denominator)
        else:
            approx = exact.limit_denominator()

        details.append(
            OccupancyInfo(
                occupation=occupation,
                atom_serials=[index + 1 for index in site_indices],  # 1-based
                elements=site_elements,
                numerator=approx.numerator,
                denominator=approx.denominator,
                involves_target_cation=self.target_cation in site_elements,
                involves_anion=any(
                    symbol in self.target_anions for symbol in site_elements
                ),
            )
        )
    summary.occupancy_details = details

    # Expansion factor = least common multiple of all denominators.
    factor = 1
    for entry in details:
        factor = factor * entry.denominator // math.gcd(factor, entry.denominator)
    summary.expansion_factor = factor

    # A factor above 64 is treated as not expandable.
    if factor > 64:
        summary.can_expand = False
        summary.skip_reason = f"扩胞因子过大({factor})"

    info.expansion_info = summary
    info.needs_expansion = summary.needs_expansion and summary.can_expand
def _check_water_molecule(self, structure: Structure) -> bool:
"""检查是否含有水分子"""
try:
oxygen_sites = [site for site in structure.sites
if 'O' in str(site.species)]
hydrogen_sites = [site for site in structure.sites
if 'H' in str(site.species)]
oxygen_sites = []
hydrogen_sites = []
for site in structure.sites:
species_str = str(site.species)
if 'O' in species_str:
oxygen_sites.append(site)
if 'H' in species_str:
hydrogen_sites.append(site)
for o_site in oxygen_sites:
nearby_h = [h for h in hydrogen_sites
if o_site.distance(h) < 1.2]
nearby_h = [h for h in hydrogen_sites if o_site.distance(h) < 1.2]
if len(nearby_h) >= 2:
return True
return False
@@ -207,15 +384,21 @@ class StructureInspector:
if info.has_radioactive_elements:
skip_reasons.append("含放射性元素")
# 关键:目标阳离子共占位是不可处理的
if info.cation_has_partial_occupancy:
skip_reasons.append(f"{self.target_cation}存在共占位")
# 阴离子共占位通常也不处理
if info.anion_has_partial_occupancy:
skip_reasons.append("阴离子存在共占位")
if info.has_water_molecule:
skip_reasons.append("含水分子")
# 扩胞因子过大
if info.expansion_info.needs_expansion and not info.expansion_info.can_expand:
skip_reasons.append(info.expansion_info.skip_reason)
if skip_reasons:
info.can_process = False
info.skip_reason = "; ".join(skip_reasons)