# offline_data_model_pipline/data_generate/zcs/fenbu/tool/merge.py

import json

# Default locations used when this file is run as a script.
KMEANS_RESULT_PATH = '/data/zhaochsh01/buquan/12345/kmeans/123451wfilter_cluster_kmeans_result.jsonl'
INSTAG_PATH = '/data/zhaochsh01/buquan/12345/instag/123451wfilter_instag.jsonl'
OUTPUT_PATH = 'merged_result.jsonl'

# Name of the identifier field in the instag records. This is a runtime
# dictionary key read from the data file, so it must stay exactly as is.
TICKET_ID_KEY = "工单编号"


def parse_json_object(line, error_label):
    """Parse one JSONL line and return it as a dict, or ``None`` to skip it.

    Blank lines are skipped silently. Lines that are not valid JSON, or whose
    top-level value is not a JSON object, are reported on stdout using
    *error_label* as the message prefix and then skipped, so one bad line
    never aborts the whole run.
    """
    text = line.strip()
    if not text:
        return None
    try:
        obj = json.loads(text)
    except json.JSONDecodeError as e:
        print(f"{error_label}: {text}, error: {e}")
        return None
    if not isinstance(obj, dict):
        # A bare list/number/string has no fields to read; report it instead
        # of failing later on ``.get``.
        print(f"{error_label}: {text}, error: expected a JSON object, got {type(obj).__name__}")
        return None
    return obj


def load_cluster_centers(kmeans_path=KMEANS_RESULT_PATH):
    """Read the K-Means result file and build a ``{data_idx: cluster_center}`` dict.

    Keys are always stored as ``str(data_idx)`` so that a numeric identifier
    in this file still matches the stringified identifier used during the
    merge. Entries whose ``data_idx`` is missing/empty or whose
    ``cluster_center`` is missing are ignored; a ``data_idx`` of ``0`` is kept.
    If the same identifier appears more than once, the last entry wins.
    """
    cluster_centers = {}
    with open(kmeans_path, 'r', encoding='utf-8') as f:
        for line in f:
            entry = parse_json_object(line, "Error parsing K-Means line")
            if entry is None:
                continue
            data_idx = entry.get("data_idx")
            cluster_center = entry.get("cluster_center")
            # Explicit None/"" checks: plain truthiness would also drop id 0.
            if data_idx is None or data_idx == "" or cluster_center is None:
                continue
            cluster_centers[str(data_idx)] = cluster_center
    return cluster_centers


def merge_cluster_centers(cluster_centers, instag_path=INSTAG_PATH, output_path=OUTPUT_PATH):
    """Copy the instag JSONL file to *output_path*, adding ``cluster_center`` where known.

    Each record's identifier field is stringified and looked up in
    *cluster_centers* (as built by ``load_cluster_centers``). Records without
    a match, or without an identifier at all, are written unchanged.
    Unparseable lines are reported and left out of the output.

    Records are streamed straight to the output file, so *output_path* must
    not be the same file as *instag_path* (it is truncated on open while the
    input is still being read).

    Returns a ``(written, matched)`` tuple: the number of records written and
    how many of them received a ``cluster_center``.
    """
    written = 0
    matched = 0
    # The input is opened first so a missing input file fails before the
    # output file is created or truncated.
    with open(instag_path, 'r', encoding='utf-8') as src, \
            open(output_path, 'w', encoding='utf-8') as dst:
        for line in src:
            record = parse_json_object(line, "Error processing JSONL line")
            if record is None:
                continue
            ticket_id = record.get(TICKET_ID_KEY)
            if ticket_id is not None:
                # Stringify so the lookup agrees with the str keys of the map.
                key = str(ticket_id)
                if key in cluster_centers:
                    record["cluster_center"] = cluster_centers[key]
                    matched += 1
            dst.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    return written, matched


def main():
    """Run the merge with the default input and output paths."""
    cluster_centers = load_cluster_centers(KMEANS_RESULT_PATH)
    merge_cluster_centers(cluster_centers, INSTAG_PATH, OUTPUT_PATH)
    print(f"数据处理完成，结果已保存到 {OUTPUT_PATH}")


if __name__ == "__main__":
    main()