Files
screen/py/step2_4_combined.py
2025-12-07 15:22:36 +08:00

147 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import pandas as pd
import math
# ================= Configuration =================
# Per-anion screening cut-offs, keyed by anion symbol.
#   perc  : percolation diameter cut-off (Step 2; the value must be greater)
#   min_d : "Minimum of d" cut-off       (Step 3; the value must be smaller)
#   node  : maximum node length cut-off  (Step 4; the value must be greater)
THRESHOLDS = {
    "O": {"perc": 0.50, "min_d": 3.0, "node": 2.2},
    "S": {"perc": 0.55, "min_d": 3.0, "node": 2.2},
    "Cl": {"perc": 0.45, "min_d": 3.0, "node": 2.0},
    "Br": {"perc": 0.45, "min_d": 3.0, "node": 2.0},
}

# Filesystem layout. These are relative paths, so they resolve against the
# current working directory of the process.
CSV_ROOT_DIR = "../output"  # root directory that holds the CSV files
DATA_SOURCE_DIR = "../data/after_step1"  # root of the original CIF files (symlink sources)
TARGET_DIR = "../data/after_screening"  # directory receiving symlinks to screened files
# =================================================
def check_requirements(row, anion_type, thresholds=None):
    """Return True when one CSV row passes the Step 2-4 screening for its anion.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) providing the
            columns "Percolation Diameter (A)", "Minimum of d" and
            "Maximum Node Length (A)".
        anion_type: Anion symbol used to look up the cut-offs, e.g. "O".
        thresholds: Optional mapping of anion symbol to a dict with the keys
            "perc", "min_d" and "node". When omitted, the module-level
            THRESHOLDS table is used.

    Returns:
        True only if all three criteria hold. False if any criterion fails,
        if a value is NaN or cannot be converted to float, or if the anion
        type has no configured cut-offs (a warning is printed in that case).

    Raises:
        KeyError: If `row` lacks one of the three required columns.
    """
    # `is None` (not truthiness) so an explicitly passed empty table is honoured.
    table = THRESHOLDS if thresholds is None else thresholds
    config = table.get(anion_type)
    if not config:
        print(f"Warning: 未知的阴离子类型 {anion_type},跳过筛选。")
        return False

    # Only the conversions can fail on empty / non-numeric cells, so the try
    # block is limited to them.
    try:
        perc = float(row["Percolation Diameter (A)"])
        min_d = float(row["Minimum of d"])
        node = float(row["Maximum Node Length (A)"])
    except (ValueError, TypeError):
        return False

    # Empty CSV cells are read as NaN; such rows can never qualify.
    if math.isnan(perc) or math.isnan(min_d) or math.isnan(node):
        return False

    return (
        perc > config["perc"]  # Step 2: percolation diameter above cut-off
        and min_d < config["min_d"]  # Step 3: minimum distance below cut-off
        and node > config["node"]  # Step 4: maximum node length above cut-off
    )
def create_symlink(group_name, anion_name, material_id,
                   source_root=None, target_root=None):
    """Symlink one screened CIF file into the screening output tree.

    Source: <source_root>/<group_name>/<anion_name>/<material_id>/<material_id>.cif
    Link:   <target_root>/<group_name>/<anion_name>/<material_id>.cif

    Args:
        group_name: Name of the group directory.
        anion_name: Name of the anion directory inside the group.
        material_id: Material identifier; names both the per-material source
            folder and the CIF file inside it.
        source_root: Optional root of the source CIF tree. Defaults to
            DATA_SOURCE_DIR.
        target_root: Optional root of the link tree. Defaults to TARGET_DIR.

    Returns:
        True if the symlink was created. False if the source file does not
        exist or the link could not be created (a message is printed in
        both cases).
    """
    if source_root is None:
        source_root = DATA_SOURCE_DIR
    if target_root is None:
        target_root = TARGET_DIR

    # The link stores an absolute source path so that it does not depend on
    # the directory the link itself lives in.
    abs_source_path = os.path.abspath(
        os.path.join(source_root, group_name, anion_name,
                     material_id, f"{material_id}.cif")
    )
    if not os.path.exists(abs_source_path):
        print(f"源文件不存在: {abs_source_path}")
        return False

    target_subdir = os.path.join(target_root, group_name, anion_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_subdir, exist_ok=True)
    target_link_path = os.path.join(target_subdir, f"{material_id}.cif")

    try:
        # lexists() is also true for a dangling symlink, so a stale link left
        # by an earlier run is removed before the new one is created.
        if os.path.lexists(target_link_path):
            os.remove(target_link_path)
        os.symlink(abs_source_path, target_link_path)
    except OSError as e:
        print(f"创建软链接失败 {material_id}: {e}")
        return False
    return True
def process_all_csvs():
    """Screen every CSV under CSV_ROOT_DIR and symlink the passing materials.

    Expected layout: <CSV_ROOT_DIR>/<Group>/<Anion>/<name>.csv, for example
    ../output/O+S/O/O.csv. The anion and group names are taken from the last
    two components of the directory that contains the CSV. CSVs whose anion
    directory is not a key of THRESHOLDS are ignored without a message.

    Each row is checked with check_requirements(); for rows that pass,
    create_symlink() is called with the row's "Filename" value as material id.
    A CSV that cannot be parsed or lacks a required column is reported and
    skipped so that the remaining files are still processed.
    """
    if not os.path.exists(CSV_ROOT_DIR):
        print(f"CSV 目录不存在: {CSV_ROOT_DIR}")
        return

    print("开始执行 Step 2-4 联合筛选...")

    required_columns = (
        "Filename",
        "Percolation Diameter (A)",
        "Minimum of d",
        "Maximum Node Length (A)",
    )

    for root, _dirs, files in os.walk(CSV_ROOT_DIR):
        # The directory components are the same for every file in `root`.
        path_parts = os.path.normpath(root).split(os.sep)

        for file in files:
            if not file.endswith(".csv"):
                continue
            csv_path = os.path.join(root, file)

            # Last component is the anion (e.g. "O"), the one before it the
            # group (e.g. "O+S").
            if len(path_parts) < 2:
                print(f"跳过路径结构异常的 CSV: {csv_path}")
                continue
            anion_name = path_parts[-1]
            group_name = path_parts[-2]

            if anion_name not in THRESHOLDS:
                continue

            print(f"正在处理: Group={group_name}, Anion={anion_name} ({file})")

            # Read the id column as text: with dtype inference an id such as
            # "0141" would become 141 (or 141.0 when the column has blanks)
            # and no longer match the CIF folder name.
            try:
                df = pd.read_csv(csv_path, dtype={"Filename": str})
            except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
                print(f" - 读取 CSV 失败,已跳过 {csv_path}: {e}")
                continue

            missing = [col for col in required_columns if col not in df.columns]
            if missing:
                print(f" - 缺少必要的列 {missing},已跳过 {csv_path}")
                continue

            pass_count = 0
            total_count = len(df)
            for _index, row in df.iterrows():
                material_id = str(row['Filename'])
                if not check_requirements(row, anion_name):
                    continue
                linked = create_symlink(group_name, anion_name, material_id)
                # Skip the tally only on an explicit False (link not made);
                # a None return carries no status and is counted as success.
                if linked is not False:
                    pass_count += 1

            print(f" - 完成: {pass_count}/{total_count} 个材料通过筛选并建立链接。")
# Script entry point: run the combined Step 2-4 screening only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    process_all_csvs()