# File listing metadata: 40 lines, 1.7 KiB, Python
"""Merge K-Means cluster assignments into work-order JSONL records.

The script reads a K-Means result file (one JSON object per line, each with
``data_idx`` and ``cluster_center``), then reads a second JSONL file whose
records carry a work-order number under the ``工单编号`` key.  Every record
whose work-order number matches a ``data_idx`` gets a ``cluster_center``
field added; all successfully parsed records are written to the output file,
matched or not.
"""
import json

# Default locations used when the script is run without arguments.
KMEANS_RESULT_PATH = '/data/zhaochsh01/buquan/12345/kmeans/123451wfilter_cluster_kmeans_result.jsonl'
INSTAG_PATH = '/data/zhaochsh01/buquan/12345/instag/123451wfilter_instag.jsonl'
OUTPUT_PATH = 'merged_result.jsonl'

# Key in the second JSONL file that holds the work-order number.
ID_FIELD = "工单编号"


def _iter_json_objects(path, error_prefix):
    """Yield every JSON object found on the non-blank lines of *path*.

    Lines that are not valid JSON, or that decode to something other than a
    JSON object, are reported on stdout (prefixed with *error_prefix*) and
    skipped, so one bad line does not abort the whole run.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"{error_prefix}: {line}, error: {e}")
                continue
            if not isinstance(record, dict):
                # A bare list/number/string has no fields to read or extend.
                print(
                    f"{error_prefix}: {line}, error: "
                    f"expected a JSON object, got {type(record).__name__}"
                )
                continue
            yield record


def _load_cluster_centers(path):
    """Return a ``{str(data_idx): cluster_center}`` dict read from *path*."""
    centers = {}
    for entry in _iter_json_objects(path, "Error parsing K-Means line"):
        data_idx = entry.get("data_idx")
        cluster_center = entry.get("cluster_center")
        # Compare against None / "" explicitly: an id of 0 and a cluster
        # index of 0 are both legitimate values and must not be dropped.
        if data_idx is None or data_idx == "" or cluster_center is None:
            continue
        # Keys are stored as strings because the lookup side also
        # stringifies the work-order number; otherwise an integer id in one
        # file would never match a string id in the other.
        centers[str(data_idx)] = cluster_center
    return centers


def _merge_records(path, centers):
    """Return the records of *path* as JSON strings, tagged with their cluster."""
    merged = []
    for record in _iter_json_objects(path, "Error processing JSONL line"):
        key = record.get(ID_FIELD)
        # A missing work-order number is simply "no match"; it must not be
        # turned into the literal string "None" and looked up.
        if key is not None and str(key) in centers:
            record["cluster_center"] = centers[str(key)]
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        merged.append(json.dumps(record, ensure_ascii=False))
    return merged


def main(kmeans_path=KMEANS_RESULT_PATH, instag_path=INSTAG_PATH,
         output_path=OUTPUT_PATH):
    """Run the merge and write the result as JSONL.

    Args:
        kmeans_path: JSONL file with ``data_idx`` / ``cluster_center`` entries.
        instag_path: JSONL file whose records carry the work-order number.
        output_path: Destination JSONL file; overwritten if it exists.

    Returns:
        The number of records written to *output_path*.

    Raises:
        OSError: If an input file cannot be opened or the output file cannot
            be written.
    """
    # 1. Build the id -> cluster lookup from the K-Means result file.
    centers = _load_cluster_centers(kmeans_path)

    # 2. Read the records and attach cluster_center where the id matches.
    merged = _merge_records(instag_path, centers)

    # 3. Write the merged records to the output file, one per line.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in merged)

    print(f"数据处理完成,结果已保存到 {output_path}")
    return len(merged)


if __name__ == "__main__":
    main()