From 54c0fb7a937961b477a33f799f58a59934146d22 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:45:32 +0800 Subject: [PATCH] [Change] Change Compassarena metric (#1749) * fix pip version * fix pip version * fix summarizer bug * fix compassarena * fix compassarena * fix compassarena --- .../datasets/subjective/compassarena/compassarena_compare.py | 2 +- .../subjective/compassarena/compassarena_compare_new.py | 2 +- .../datasets/subjective/compassarena/compassarena_compare.py | 2 +- .../subjective/compassarena/compassarena_compare_new.py | 2 +- opencompass/datasets/subjective/compass_arena.py | 3 ++- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/datasets/subjective/compassarena/compassarena_compare.py b/configs/datasets/subjective/compassarena/compassarena_compare.py index 2c9b3e9b..90141e66 100644 --- a/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -149,6 +149,6 @@ for _name, _prompt in sub_map.items(): mode='m2n', infer_order='double', base_models=gpt4, - summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'), + summarizer = dict(type=CompassArenaSummarizer, summary_type='single'), given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}] )) diff --git a/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/configs/datasets/subjective/compassarena/compassarena_compare_new.py index e7b14614..96d7ac65 100644 --- a/configs/datasets/subjective/compassarena/compassarena_compare_new.py +++ b/configs/datasets/subjective/compassarena/compassarena_compare_new.py @@ -120,7 +120,7 @@ for _name, _prompt in sub_map.items(): ), ]), ), - dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True), + dict_postprocessor=dict(type=compassarena_postprocess, summary_type='single', check_pos_bias=True), ), pred_role='BOT', ) diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py index 2c9b3e9b..90141e66 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -149,6 +149,6 @@ for _name, _prompt in sub_map.items(): mode='m2n', infer_order='double', base_models=gpt4, - summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'), + summarizer = dict(type=CompassArenaSummarizer, summary_type='single'), given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}] )) diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py index e7b14614..96d7ac65 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py @@ -120,7 +120,7 @@ for _name, _prompt in sub_map.items(): ), ]), ), - dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True), + dict_postprocessor=dict(type=compassarena_postprocess, summary_type='single', check_pos_bias=True), ), pred_role='BOT', ) diff --git a/opencompass/datasets/subjective/compass_arena.py b/opencompass/datasets/subjective/compass_arena.py index f6ffa4f8..5afc68d0 100644 --- a/opencompass/datasets/subjective/compass_arena.py +++ b/opencompass/datasets/subjective/compass_arena.py @@ -65,7 +65,7 @@ def post_process_compassarena(item): @DICT_POSTPROCESSORS.register_module('compassarena') def compassarena_postprocess(output: dict, output_path: str, - summary_type='half_add', + summary_type='single', check_pos_bias=True) -> dict: judged_answers, references = get_judgeanswer_and_reference( output, output_path, post_process_compassarena) @@ -81,6 +81,7 @@ def compassarena_postprocess(output: dict, model1 = references[0]['answer1'] for prediction, reference in zip(judged_answers, references): + categories[reference['capability']] += 1 if prediction == 'A':