offline_data_model_pipline/data_generate/zcs/fenbu/tool/xslx2jsonl.py

67 lines
2.0 KiB
Python
Raw Normal View History

2025-05-13 14:57:22 +08:00
import pandas as pd
import json
from datetime import datetime
import numpy as np
def convert_value(obj):
    """Convert a value that ``json`` cannot serialize into a plain Python value.

    Args:
        obj: Any cell value, typically taken from a pandas row.

    Returns:
        ``None`` for missing scalars (None, NaN, NaT, pd.NA); a
        ``'%Y-%m-%d %H:%M:%S'`` string for timestamps/datetimes; a builtin
        ``bool``/``int``/``float`` for numpy scalars; a list for numpy arrays;
        a dict for pandas Series; otherwise ``obj`` unchanged.
    """
    if obj is None:
        return None

    # pd.isna() returns an array-like for array-like input, and using that in
    # a boolean context raises "truth value ... is ambiguous". Only treat the
    # result as a missing-value flag when it is a single boolean.
    missing = pd.isna(obj)
    if isinstance(missing, (bool, np.bool_)) and missing:
        return None

    # Timestamps / datetimes (NaT has already been mapped to None above,
    # which matters because NaT cannot be formatted with strftime).
    if isinstance(obj, (pd.Timestamp, datetime)):
        return obj.strftime('%Y-%m-%d %H:%M:%S')

    # numpy scalar types -> builtin Python scalars
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)

    # numpy arrays and pandas Series -> builtin containers
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, pd.Series):
        return obj.to_dict()

    # Everything else is returned unchanged
    return obj
def xlsx_to_jsonl(input_file, output_file):
    """
    Convert an XLSX file to JSONL, writing one JSON object per spreadsheet row.

    Args:
        input_file (str): Path of the XLSX file to read.
        output_file (str): Path of the JSONL file to write (opened in 'w'
            mode, so existing content is overwritten).

    Returns:
        bool: True if the conversion finished, False if any exception was
        raised. Errors are printed rather than propagated.
    """
    try:
        # Read the Excel file
        df = pd.read_excel(input_file)
        columns = list(df.columns)
        # Write the data to the JSONL file
        with open(output_file, 'w', encoding='utf-8') as f:
            # itertuples() is used instead of iterrows(): iterrows() builds a
            # Series per row, which upcasts mixed numeric columns to a common
            # dtype (e.g. int cells would be emitted as 1.0).
            for values in df.itertuples(index=False, name=None):
                # Map column name -> JSON-safe value
                record = {k: convert_value(v) for k, v in zip(columns, values)}
                # Emit one JSON document per line
                json.dump(record, f, ensure_ascii=False)
                f.write('\n')
        print(f"转换成功,结果已保存到 {output_file}")
        return True
    except Exception as e:
        print(f"转换过程中发生错误: {str(e)}")
        return False
# Example usage: `python xslx2jsonl.py [input.xlsx] [output.jsonl]`.
# Both arguments are optional; the original hard-coded paths are the defaults.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    # Input XLSX path (first CLI argument, if given)
    input_xlsx = cli_args[0] if len(cli_args) > 0 else "/data/zhaochsh01/buquan/12345/1w_fillter.xlsx"
    # Output JSONL path (second CLI argument, if given)
    output_jsonl = cli_args[1] if len(cli_args) > 1 else "output.jsonl"
    xlsx_to_jsonl(input_xlsx, output_jsonl)