CSM计算

2025-09-22 11:18:39 +08:00
parent 71f6ae8928
commit 28c2323ce8
4 changed files with 307 additions and 34 deletions
--- a/data_get/data_get.py
+++ b/data_get/data_get.py
@@ -0,0 +1,127 @@
+import pandas as pd
+import os
+import re
+
+
+def extract_cif_from_xlsx(
+        xlsx_path: str,
+        output_dir: str,
+        naming_mode: str = 'formula',
+        name_col: int = 0,
+        cif_col: int = 1,
+        prefix: str = 'wjy'
+):
+    """
+    从 XLSX 文件中提取 CIF 数据并保存为单独的 .cif 文件。
+
+    Args:
+        xlsx_path (str): 输入的 XLSX 文件的路径。
+        output_dir (str): 输出 .cif 文件的文件夹路径。
+        naming_mode (str, optional): CIF 文件的命名模式。
+                                     可选值为 'formula' (使用第一列的名字) 或
+                                     'auto' (使用前缀+自动递增编号)。
+                                     默认为 'formula'。
+        name_col (int, optional): 包含文件名的列的索引（从0开始）。默认为 0。
+        cif_col (int, optional): 包含 CIF 内容的列的索引（从0开始）。默认为 1。
+        prefix (str, optional): 在 'auto' 命名模式下使用的文件名前缀。默认为 'wjy'。
+
+    Raises:
+        FileNotFoundError: 如果指定的 xlsx_path 文件不存在。
+        ValueError: 如果指定的 naming_mode 无效。
+        Exception: 处理过程中发生的其他错误。
+    """
+    # --- 1. 参数校验和准备 ---
+    if not os.path.exists(xlsx_path):
+        raise FileNotFoundError(f"错误: 输入文件未找到 -> {xlsx_path}")
+
+    if naming_mode not in ['formula', 'auto']:
+        raise ValueError(f"错误: 'naming_mode' 参数必须是 'formula' 或 'auto'，但收到了 '{naming_mode}'")
+
+    # 创建输出目录（如果不存在）
+    os.makedirs(output_dir, exist_ok=True)
+    print(f"CIF 文件将保存到: {output_dir}")
+
+    try:
+        # --- 2. 读取 XLSX 文件 ---
+        # header=None 表示第一行不是标题，将其作为数据读取
+        df = pd.read_excel(xlsx_path, header=None)
+
+        # 跳过原始文件的表头行（'formula', 'cif'）
+        if str(df.iloc[0, name_col]).strip().lower() == 'formula' and str(df.iloc[0, cif_col]).strip().lower() == 'cif':
+            df = df.iloc[1:]
+            print("检测到并跳过了表头行。")
+
+        # --- 3. 遍历数据并生成文件 ---
+        success_count = 0
+        for index, row in df.iterrows():
+            # 获取文件名和 CIF 内容
+            formula_name = str(row[name_col])
+            cif_content = str(row[cif_col])
+
+            # 跳过内容为空的行
+            if pd.isna(row[name_col]) or pd.isna(row[cif_col]) or not cif_content.strip():
+                print(f"警告: 第 {index + 2} 行数据不完整，已跳过。")
+                continue
+
+            # --- 4. 根据命名模式确定文件名 ---
+            if naming_mode == 'formula':
+                # 清理文件名，替换掉不适合做文件名的特殊字符
+                # 例如：将 (PO4)3 替换为 _PO4_3，将 / 替换为 _
+                safe_filename = re.sub(r'[\\/*?:"<>|()]', '_', formula_name)
+                filename = f"{safe_filename}.cif"
+            else:  # naming_mode == 'auto'
+                # 使用 format 方法来确保编号格式统一，例如 001, 002
+                filename = f"{prefix}_{success_count + 1:03d}.cif"
+
+            # 构造完整的输出文件路径
+            output_path = os.path.join(output_dir, filename)
+
+            # --- 5. 写入 CIF 文件 ---
+            try:
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    f.write(cif_content)
+                success_count += 1
+            except IOError as e:
+                print(f"错误: 无法写入文件 {output_path}。原因: {e}")
+
+        print(f"\n处理完成！成功提取并生成了 {success_count} 个 CIF 文件。")
+
+    except Exception as e:
+        print(f"处理 XLSX 文件时发生错误: {e}")
+
+
+# --- 函数使用示例 ---
+if __name__ == '__main__':
+    # 假设您的 XLSX 文件名为 'materials.xlsx'，且与此脚本在同一目录下
+    source_xlsx_file = 'input/cif_dataset.xlsx'
+
+    # 检查示例文件是否存在，如果不存在则创建一个
+    if not os.path.exists(source_xlsx_file):
+        print(f"未找到示例文件 '{source_xlsx_file}'，正在创建一个...")
+        example_data = {
+            'formula': ['Li3Al0.3Ti1.7(PO4)3', 'Li6.5La3Zr1.75W0.25O12', 'Invalid/Name*Test'],
+            'cif': ['# CIF Data for Li3Al0.3...\n_atom_site_type_symbol\n Li\n Al\n Ti\n P\n O',
+                    '# CIF Data for Li6.5La3...\n_symmetry_space_group_name_H-M \'I a -3 d\'',
+                    '# CIF Data for Invalid Name Test']
+        }
+        pd.DataFrame(example_data).to_excel(source_xlsx_file, index=False, header=True)
+        print("示例文件创建成功。")
+
+    # --- 示例 1: 使用第一列的 'formula' 命名 ---
+    # print("\n--- 示例 1: 使用 'formula' 命名模式 ---")
+    # output_folder_1 = 'cif_by_formula'
+    # extract_cif_from_xlsx(
+    #     xlsx_path=source_xlsx_file,
+    #     output_dir=output_folder_1,
+    #     naming_mode='formula'
+    # )
+
+    # --- 示例 2: 使用 'wjy+编号' 自动命名 ---
+    print("\n--- 示例 2: 使用 'auto' 命名模式 ---")
+    output_folder_2 = 'cif_by_auto'
+    extract_cif_from_xlsx(
+        xlsx_path=source_xlsx_file,
+        output_dir=output_folder_2,
+        naming_mode='auto',
+        prefix='wjy'
+    )
--- a/data_get/input/cif_dataset.xlsx
+++ b/data_get/input/cif_dataset.xlsx