offline_data_model_pipline/data_generate/zcs/fenbu/tool/merge.py

40 lines
1.7 KiB
Python
Raw Normal View History

2025-05-13 14:57:22 +08:00
import json
# 1. Read the K-Means clustering result file (JSONL: one JSON object per
#    line) and build a {str(data_idx): cluster_center} lookup table.
#
#    Keys are normalised to ``str`` because the merge step looks records up
#    with ``str(<工单编号>)``; storing a numeric ``data_idx`` as-is would never
#    match that string key.  Malformed lines are reported and skipped so one
#    bad row does not abort the whole run.
kmeans_data = {}
with open('/data/zhaochsh01/buquan/12345/kmeans/123451wfilter_cluster_kmeans_result.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines
        try:
            entry = json.loads(line)
        except (ValueError, RecursionError) as e:
            # json.JSONDecodeError is a ValueError subclass; RecursionError
            # covers pathologically nested documents.
            print(f"Error parsing K-Means line: {line}, error: {e}")
            continue
        if not isinstance(entry, dict):
            # Valid JSON, but not an object: there are no fields to read.
            print(f"Error parsing K-Means line: {line}, error: expected a JSON object")
            continue
        data_idx = entry.get("data_idx")
        # The original note says cluster_center is an integer; it is stored
        # unchanged whatever its JSON type.
        cluster_center = entry.get("cluster_center")
        # Skip only genuinely missing values.  A plain truthiness test would
        # also discard a legitimate id of 0.
        if data_idx is None or data_idx == "" or cluster_center is None:
            continue
        kmeans_data[str(data_idx)] = cluster_center
# 2. Walk the instag JSONL file and, for every record whose "工单编号"
#    (stringified) is a key of ``kmeans_data``, attach the matching value as
#    "cluster_center".  Every successfully processed record is re-serialised
#    and collected, matched or not; lines that raise are reported and dropped.
output_lines = []
with open('/data/zhaochsh01/buquan/12345/instag/123451wfilter_instag.jsonl', 'r', encoding='utf-8') as instag_file:
    for line in map(str.strip, instag_file):
        if not line:
            continue  # nothing to do for blank lines
        try:
            record = json.loads(line)
            # Stringify the ticket number so it is compared as text.
            ticket_id = str(record.get("工单编号"))
            if ticket_id in kmeans_data:
                record["cluster_center"] = kmeans_data[ticket_id]
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            serialized = json.dumps(record, ensure_ascii=False)
            output_lines.append(serialized)
        except Exception as e:
            # Best-effort: report the offending line and carry on.
            print(f"Error processing JSONL line: {line}, error: {e}")
# 3. Persist the merged records to a new file in the current working
#    directory, one JSON document per line, then report completion.
with open('merged_result.jsonl', 'w', encoding='utf-8') as out_file:
    out_file.writelines(merged + '\n' for merged in output_lines)
print("数据处理完成,结果已保存到 merged_result.jsonl")