offline_data_model_pipline/data_generate/zcs/fenbu/tool/merge.py

40 lines
1.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
# 1. Read the K-Means clustering result file (JSON Lines: one JSON object per
#    line) and build a {data_idx: cluster_center} dictionary.
#
#    Keys are normalized to strings because the merge step looks records up
#    with a stringified ticket number; storing the raw JSON value would make
#    numeric data_idx values impossible to match.
kmeans_data = {}
with open('/data/zhaochsh01/buquan/12345/kmeans/123451wfilter_cluster_kmeans_result.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        try:
            entry = json.loads(line)
            data_idx = entry.get("data_idx")
            # The original note describes cluster_center as an integer.
            cluster_center = entry.get("cluster_center")
        except Exception as e:
            # Best effort: report the malformed line and keep going.
            print(f"Error parsing K-Means line: {line}, error: {e}")
            continue
        # Skip records without a usable identifier or cluster. Compare
        # against None / "" explicitly so that a data_idx of 0 is kept.
        if data_idx is None or data_idx == "" or cluster_center is None:
            continue
        kmeans_data[str(data_idx)] = cluster_center
# 2. Read the instag JSONL file and, for every record whose ticket number is
#    present in kmeans_data, attach the corresponding cluster_center.
#    Every successfully parsed record is kept, matched or not.
output_lines = []
instag_path = '/data/zhaochsh01/buquan/12345/instag/123451wfilter_instag.jsonl'
with open(instag_path, 'r', encoding='utf-8') as instag_file:
    for raw in instag_file:
        text = raw.strip()
        if text:
            try:
                record = json.loads(text)
                # Stringify the ticket number before the lookup; a missing
                # key therefore becomes the string "None".
                ticket_id = str(record.get("工单编号"))
                if ticket_id in kmeans_data:
                    record["cluster_center"] = kmeans_data[ticket_id]
                # Serialize back to JSON text, keeping non-ASCII characters
                # readable instead of \u-escaped.
                serialized = json.dumps(record, ensure_ascii=False)
                output_lines.append(serialized)
            except Exception as err:
                # Best effort: report the offending line and continue.
                print(f"Error processing JSONL line: {text}, error: {err}")
# 3. Write the merged records to a new file, one JSON document per line,
#    then report completion on stdout.
with open('merged_result.jsonl', 'w', encoding='utf-8') as out_file:
    out_file.writelines(f"{record}\n" for record in output_lines)
print("数据处理完成,结果已保存到 merged_result.jsonl")