import json
from datetime import date, datetime, time

import numpy as np
import pandas as pd


def convert_value(obj):
    """Convert a cell value into something ``json`` can serialize.

    Containers (numpy arrays, pandas Series, lists, tuples, dicts) are
    converted recursively. Missing values (``None``, NaN, ``pd.NaT``,
    ``pd.NA``) become ``None``. Timestamps/datetimes are rendered as
    ``'%Y-%m-%d %H:%M:%S'`` strings, plain dates and times as ISO strings,
    and numpy scalars become the matching Python ``bool``/``int``/``float``.
    Any other value is returned unchanged.

    Args:
        obj: The value to convert.

    Returns:
        A JSON-serializable equivalent of ``obj`` (or ``obj`` itself when
        no conversion rule applies).
    """
    if obj is None:
        return None

    # Containers must be handled before the pd.isna() check below: on an
    # array-like, pd.isna() returns a boolean array whose truth value is
    # ambiguous and raises ValueError inside an ``if``.
    if isinstance(obj, np.ndarray):
        # tolist() yields nested lists (or a bare scalar for 0-d arrays);
        # recurse so NaN elements inside are mapped to None as well.
        return convert_value(obj.tolist())
    if isinstance(obj, pd.Series):
        return {key: convert_value(val) for key, val in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_value(item) for item in obj]
    if isinstance(obj, dict):
        return {key: convert_value(val) for key, val in obj.items()}

    # Scalar missing values: NaN, pd.NaT, pd.NA. Guard on the result type
    # in case an unexpected array-like slips through.
    is_missing = pd.isna(obj)
    if isinstance(is_missing, (bool, np.bool_)) and is_missing:
        return None

    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(obj, (pd.Timestamp, datetime)):
        return obj.strftime('%Y-%m-%d %H:%M:%S')
    if isinstance(obj, (date, time)):
        return obj.isoformat()

    # numpy scalars are not accepted by the json module.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)

    # Everything else is passed through untouched.
    return obj


def xlsx_to_jsonl(input_file, output_file):
    """Convert an XLSX file to JSONL (one JSON object per row).

    Any exception raised while reading or writing is caught and reported
    on stdout; the function does not re-raise.

    Args:
        input_file (str): Path of the XLSX file to read.
        output_file (str): Path of the JSONL file to write.
    """
    try:
        # Read the Excel file.
        df = pd.read_excel(input_file)
        columns = list(df.columns)

        # Write one JSON document per row.
        with open(output_file, 'w', encoding='utf-8') as f:
            # itertuples keeps each column's own dtype; iterrows would
            # upcast a whole row to a common dtype (e.g. int -> float).
            for values in df.itertuples(index=False, name=None):
                record = {
                    col: convert_value(val)
                    for col, val in zip(columns, values)
                }
                json.dump(record, f, ensure_ascii=False)
                f.write('\n')

        print(f"转换成功,结果已保存到 {output_file}")

    except Exception as e:
        print(f"转换过程中发生错误: {str(e)}")


# Usage example
if __name__ == "__main__":
    input_xlsx = "/data/zhaochsh01/buquan/12345/1w_fillter.xlsx"  # replace with your input file path
    output_jsonl = "output.jsonl"  # replace with the desired output file path

    xlsx_to_jsonl(input_xlsx, output_jsonl)