67 lines
2.0 KiB
Python
67 lines
2.0 KiB
Python
|
import pandas as pd
|
||
|
import json
|
||
|
from datetime import datetime
|
||
|
import numpy as np
|
||
|
|
||
|
def convert_value(obj):
    """Convert a single value into something the ``json`` module can serialize.

    Args:
        obj: Any value, typically one cell taken from a pandas row.

    Returns:
        ``None`` for ``None`` and for scalars that ``pd.isna`` reports as
        missing (e.g. NaN, NaT); a ``'%Y-%m-%d %H:%M:%S'`` string for
        ``pd.Timestamp`` / ``datetime``; a built-in ``bool`` / ``int`` /
        ``float`` for numpy scalars; ``ndarray.tolist()`` for a numpy array;
        ``Series.to_dict()`` for a pandas Series; otherwise ``obj`` unchanged.
    """
    if obj is None:
        return None

    # Containers must be handled before the missing-value test: pd.isna()
    # answers element-wise for them, and that result cannot be used as an
    # `if` condition (it raises "truth value ... is ambiguous").
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, pd.Series):
        return obj.to_dict()

    # Only act on a scalar answer; other list-likes make pd.isna() return an
    # array, in which case the value is passed through untouched below.
    missing = pd.isna(obj)
    if isinstance(missing, (bool, np.bool_)) and missing:
        return None

    # NaT never reaches this point (pd.isna caught it above), so strftime
    # is only ever called on a real timestamp.
    if isinstance(obj, (pd.Timestamp, datetime)):
        return obj.strftime('%Y-%m-%d %H:%M:%S')

    # numpy scalars are not JSON serializable; unwrap to built-in types.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)

    # Everything else is returned as-is.
    return obj
|
||
|
|
||
|
def xlsx_to_jsonl(input_file, output_file):
    """Convert an XLSX file into JSON Lines format.

    The frame returned by ``pd.read_excel(input_file)`` is written to
    ``output_file`` as UTF-8 text, one JSON object per row, keyed by column
    label, with every cell passed through ``convert_value``.

    Args:
        input_file (str): Path of the XLSX file to read.
        output_file (str): Path of the JSONL file to write.

    Returns:
        None. Any exception raised while reading or writing is caught and
        reported with ``print`` instead of being propagated.
    """
    try:
        # Read the Excel file.
        df = pd.read_excel(input_file)
        columns = list(df.columns)

        with open(output_file, 'w', encoding='utf-8') as f:
            # itertuples() hands back each cell with its own column's type.
            # iterrows() would build a Series per row, which upcasts mixed
            # numeric columns to a common dtype (integers become floats).
            for values in df.itertuples(index=False, name=None):
                record = {
                    column: convert_value(value)
                    for column, value in zip(columns, values)
                }
                # One JSON document per line; keep non-ASCII text readable.
                f.write(json.dumps(record, ensure_ascii=False) + '\n')

        print(f"转换成功,结果已保存到 {output_file}")

    except Exception as e:
        # Deliberate best-effort boundary: report the failure, do not raise.
        print(f"转换过程中发生错误: {str(e)}")
|
||
|
|
||
|
|
||
|
|
||
|
# Example usage: runs the conversion only when executed as a script.
if __name__ == "__main__":
    input_xlsx, output_jsonl = (
        "/data/zhaochsh01/buquan/12345/1w_fillter.xlsx",  # replace with your input file path
        "output.jsonl",  # replace with the output file path you want
    )

    xlsx_to_jsonl(input_file=input_xlsx, output_file=output_jsonl)
|