calc_v1
This commit is contained in:
154
py/extract_data.py
Normal file
154
py/extract_data.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def extract_parameters_from_log(log_path):
    """Pull the three key parameters out of a step-1 log.txt.

    Returns a ``(percolation, min_d, max_node)`` tuple of strings; any
    value that cannot be found — or all three, if the file is missing —
    comes back as None.
    """
    if not os.path.exists(log_path):
        return None, None, None

    with open(log_path, 'r', encoding='utf-8') as fh:
        text = fh.read()

    # Patterns for the three quantities, in output order:
    #  1. "# Percolation diameter (A): 1.06"            (old Step 2)
    #  2. "the minium of d" with the value on the next
    #     line — the typo "minium" matches what step 1
    #     actually writes, so keep it as-is              (old Step 3)
    #  3. "# Maximum node length detected: 1.332 A"      (old Step 4)
    patterns = (
        r"Percolation diameter \(A\):\s*([\d\.]+)",
        r"the minium of d\s*\n\s*([\d\.]+)",
        r"Maximum node length detected:\s*([\d\.]+)\s*A",
    )

    values = []
    for pattern in patterns:
        hit = re.search(pattern, text)
        values.append(hit.group(1) if hit else None)

    return tuple(values)
|
||||
|
||||
|
||||
def process_folder_recursively(base_input_folder, base_output_folder):
    """Walk the step-1 output tree and emit one CSV per anion directory.

    Layout rules:
      * The first level under ``base_input_folder`` holds anion
        categories (e.g. O, S, O+S).
      * A single-anion directory (no "+" in its name) is processed
        directly.
      * A mixed-anion directory (e.g. O+S) holds one sub-directory per
        anion (O+S/O, O+S/S); each is processed separately.
      * Results mirror the same directory structure under
        ``base_output_folder``.
    """
    if not os.path.exists(base_input_folder):
        print(f"输入目录 {base_input_folder} 不存在")
        return

    # First level under the input root: anion categories (O, S, Cl, S+O, ...).
    for entry in os.listdir(base_input_folder):
        entry_path = os.path.join(base_input_folder, entry)
        if not os.path.isdir(entry_path):
            continue

        if "+" in entry:
            # Mixed anion (e.g. S+O): descend one level into S+O/S, S+O/O
            # and process each anion sub-directory on its own.
            for anion in os.listdir(entry_path):
                anion_path = os.path.join(entry_path, anion)
                if not os.path.isdir(anion_path):
                    continue
                # Output mirrors the input layout: <out>/S+O/S/S.csv
                extract_and_save(
                    anion_path,
                    os.path.join(base_output_folder, entry, anion),
                    f"{anion}.csv",
                )
        else:
            # Single anion (e.g. O): <out>/O/O.csv
            extract_and_save(
                entry_path,
                os.path.join(base_output_folder, entry),
                f"{entry}.csv",
            )
|
||||
|
||||
|
||||
def extract_and_save(input_dir, output_dir, csv_name):
    """Extract log parameters from every material folder and write one CSV.

    Args:
        input_dir: directory containing one sub-folder per material
            (e.g. .../O/141, .../O/142, ...). Silently skipped if absent.
        output_dir: directory the CSV is written to (created on demand).
        csv_name: file name of the CSV.
    """
    if not os.path.exists(input_dir):
        return

    # Every material is a sub-directory of input_dir (e.g. 141, 142, ...).
    material_folders = [f for f in os.listdir(input_dir)
                        if os.path.isdir(os.path.join(input_dir, f))]

    print(f"正在处理目录: {input_dir}, 发现 {len(material_folders)} 个材料文件夹")

    # One row per material folder.  A missing log.txt or an unmatched
    # pattern simply yields None values, which keeps failures visible in
    # the CSV instead of silently dropping the material.  (The original
    # code had an if/else here whose two branches produced identical
    # rows — the else fired only when all three values were already
    # None — so a single unconditional append is equivalent.)
    data_list = []
    for material_id in material_folders:
        # Per the step-1 convention, the log file is named log.txt.
        log_path = os.path.join(input_dir, material_id, "log.txt")
        perc, min_d, max_node = extract_parameters_from_log(log_path)
        data_list.append({
            "Filename": material_id,
            "Percolation Diameter (A)": perc,
            "Minimum of d": min_d,
            "Maximum Node Length (A)": max_node,
        })

    if data_list:
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + os.makedirs() pair.
        os.makedirs(output_dir, exist_ok=True)
        csv_path = os.path.join(output_dir, csv_name)

        df = pd.DataFrame(data_list)
        # Pin the column order explicitly (dict insertion order already
        # matches, but this guards against future key changes).
        df = df[["Filename", "Percolation Diameter (A)",
                 "Minimum of d", "Maximum Node Length (A)"]]
        df.to_csv(csv_path, index=False)
        print(f"数据已保存至: {csv_path}")
    else:
        print(f"目录 {input_dir} 未提取到有效数据")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Base input path: data as left by the step-1 processing.
    step1_root = "../data/after_step1"
    # Base output path: the CSV tree is mirrored under here.
    csv_root = "../output"

    process_folder_recursively(step1_root, csv_root)
|
||||
Reference in New Issue
Block a user