Files
screen/py/extract_data.py
2025-12-07 15:22:36 +08:00

154 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
import pandas as pd
# A decimal number with optional sign and optional exponent, e.g. 1.06,
# -3.5, .5 or 8.5e-01. A bare ``[\d.]+`` would truncate "8.5e-01" to "8.5"
# and would not match signed values at all.
_LOG_NUMBER = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# Matches e.g. "# Percolation diameter (A): 1.06"
_RE_PERCOLATION = re.compile(
    r"Percolation diameter \(A\):\s*(" + _LOG_NUMBER + r")"
)
# Matches the header line "the minium of d" (spelling as written in the log)
# followed by the value on the next line, e.g. "3.862140561244235".
_RE_MIN_D = re.compile(r"the minium of d\s*\n\s*(" + _LOG_NUMBER + r")")
# Matches e.g. "# Maximum node length detected: 1.332 A"
_RE_MAX_NODE = re.compile(
    r"Maximum node length detected:\s*(" + _LOG_NUMBER + r")\s*A"
)


def extract_parameters_from_log(log_path):
    """Extract the three key parameters from a log file.

    The log text is searched for the first occurrence of each of:

    1. ``Percolation diameter (A): <number>``
    2. ``the minium of d`` followed by ``<number>`` on the next line
    3. ``Maximum node length detected: <number> A``

    Args:
        log_path: Path of the log file to read.

    Returns:
        A tuple ``(percolation_diameter, minimum_of_d, maximum_node_length)``.
        Each element is the matched number as a string (exactly as written
        in the log), or ``None`` when that entry was not found. If
        ``log_path`` is not an existing regular file, ``(None, None, None)``
        is returned.
    """
    # isfile() rather than exists(): a directory that happens to be named
    # like the log must be reported as "no data" instead of making open() fail.
    if not os.path.isfile(log_path):
        return None, None, None

    # The patterns are pure ASCII, so undecodable bytes elsewhere in the log
    # can safely be replaced instead of aborting the whole extraction run.
    with open(log_path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()

    def first_group(pattern):
        # Return the captured number of the first match, or None.
        match = pattern.search(content)
        return match.group(1) if match else None

    return (
        first_group(_RE_PERCOLATION),
        first_group(_RE_MIN_D),
        first_group(_RE_MAX_NODE),
    )
def _sorted_subdirs(parent):
    """Return the names of the directories directly inside ``parent``, sorted."""
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order reproducible between runs and machines.
    return sorted(
        name
        for name in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, name))
    )


def process_folder_recursively(base_input_folder, base_output_folder):
    """Walk the input tree and write one CSV per anion directory.

    Logic:
    1. Iterate over the first-level subdirectories of ``base_input_folder``
       (e.g. ``O``, ``S``, ``S+O``).
    2. A directory whose name does not contain ``+`` is passed to
       ``extract_and_save`` directly; its CSV is written to
       ``<base_output_folder>/<dir>/<dir>.csv``.
    3. A directory whose name contains ``+`` (e.g. ``S+O``) is descended one
       level further; each of its subdirectories (e.g. ``S+O/S`` and
       ``S+O/O``) is processed separately and written to
       ``<base_output_folder>/<dir>/<sub>/<sub>.csv``.

    Args:
        base_input_folder: Root directory containing the anion directories.
        base_output_folder: Root directory under which the CSV files are
            written, mirroring the input directory structure.

    Returns:
        None. If ``base_input_folder`` is not an existing directory, a
        message is printed and nothing is processed.
    """
    # isdir() rather than exists(): a regular file at this path would
    # otherwise make os.listdir() raise NotADirectoryError.
    if not os.path.isdir(base_input_folder):
        print(f"输入目录 {base_input_folder} 不存在")
        return

    for top_dir in _sorted_subdirs(base_input_folder):
        top_path = os.path.join(base_input_folder, top_dir)

        if "+" in top_dir:
            # Mixed case, e.g. S+O: process S+O/S and S+O/O separately.
            for sub_anion in _sorted_subdirs(top_path):
                extract_and_save(
                    os.path.join(top_path, sub_anion),
                    os.path.join(base_output_folder, top_dir, sub_anion),
                    f"{sub_anion}.csv",
                )
        else:
            # Single case, e.g. O: the material folders sit directly below.
            extract_and_save(
                top_path,
                os.path.join(base_output_folder, top_dir),
                f"{top_dir}.csv",
            )
def extract_and_save(input_dir, output_dir, csv_name):
    """Extract the log parameters of every material folder and save a CSV.

    Every subdirectory of ``input_dir`` is treated as one material folder
    (e.g. ``141``, ``142``). Its ``log.txt`` is parsed with
    ``extract_parameters_from_log`` and one row is recorded per folder.
    Folders whose log is missing or yields no values are still recorded,
    with empty cells, so that failed materials remain visible in the CSV.

    Args:
        input_dir: Directory containing the material folders (e.g. ``.../O/``).
        output_dir: Directory in which the CSV is saved; created if needed.
        csv_name: File name of the CSV.

    Returns:
        None. Nothing is written when ``input_dir`` is not an existing
        directory or contains no subdirectories.
    """
    columns = [
        "Filename",
        "Percolation Diameter (A)",
        "Minimum of d",
        "Maximum Node Length (A)",
    ]

    # isdir() rather than exists(): a regular file at this path would
    # otherwise make os.listdir() raise NotADirectoryError.
    if not os.path.isdir(input_dir):
        return

    # os.listdir() order is arbitrary; sort so the CSV rows are reproducible.
    material_folders = sorted(
        name
        for name in os.listdir(input_dir)
        if os.path.isdir(os.path.join(input_dir, name))
    )
    print(f"正在处理目录: {input_dir}, 发现 {len(material_folders)} 个材料文件夹")

    data_list = []
    for material_id in material_folders:
        log_path = os.path.join(input_dir, material_id, "log.txt")
        perc, min_d, max_node = extract_parameters_from_log(log_path)
        # A row is recorded whether or not any value was found: values that
        # could not be extracted are None and end up as empty CSV cells.
        data_list.append(dict(zip(columns, (material_id, perc, min_d, max_node))))

    if not data_list:
        print(f"目录 {input_dir} 未提取到有效数据")
        return

    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, csv_name)
    # Passing columns= fixes the column order of the output file.
    pd.DataFrame(data_list, columns=columns).to_csv(csv_path, index=False)
    print(f"数据已保存至: {csv_path}")
if __name__ == "__main__":
    # Script entry point: read the step-1 results from ../data/after_step1
    # and write the generated CSV files below ../output. Both paths are
    # relative to the current working directory.
    process_folder_recursively(
        base_input_folder="../data/after_step1",
        base_output_folder="../output",
    )