diff --git a/opencompass/summarizers/subjective/common_summarizer.py b/opencompass/summarizers/subjective/common_summarizer.py index 7ba7341c..ccb8d139 100644 --- a/opencompass/summarizers/subjective/common_summarizer.py +++ b/opencompass/summarizers/subjective/common_summarizer.py @@ -73,6 +73,7 @@ def get_capability_results( with open(fout, 'a+', newline='') as csvfile: writer = csv.writer(csvfile) writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns]) + return {column:capability_avg_ratings[column] for column in columns if column != ''} class CommonSummarizer(CompassArenaSummarizer): @@ -113,6 +114,7 @@ class CommonSummarizer(CompassArenaSummarizer): fout_flag = 0 output_tmp_file = osp.join(output_dir, 'result.csv') output_file = osp.join(output_dir, 'total_result.csv') + json_result={} for eval_model_cfg in self.eval_model_cfgs: for judge_model_cfg in self.judge_model_cfgs: eval_model_abbr = model_abbr_from_cfg(eval_model_cfg) @@ -125,7 +127,10 @@ class CommonSummarizer(CompassArenaSummarizer): judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function) show_dataset_abbr = dataset_abbr_from_cfg(dataset) - get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr) + tmp_result = get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr) + if show_judge_model_abbr not in json_result: + json_result[show_judge_model_abbr] = {} + json_result[show_judge_model_abbr][show_model_abbr] = tmp_result fout_flag += 1 else: print(subdir_path + ' is not exist! please check!') @@ -144,3 +149,4 @@ class CommonSummarizer(CompassArenaSummarizer): f.write(','.join(map(str, line)) + '\n') print(t) print(output_file) + return {'qa_bench_' + show_dataset_abbr:json_result}