From 5c6dc908cd50851bf7810eb90e47d8ef1434aa88 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Tue, 30 Jan 2024 16:34:38 +0800
Subject: [PATCH] fix compass arena (#854)

---
 .../compassarena/compassarena_compare.py      |  19 +---
 configs/eval_subjective_compassarena.py       |  13 ++-
 .../summarizers/subjective/compass_arena.py   | 106 +++++++++++-------
 opencompass/summarizers/subjective/utils.py   |   7 +-
 4 files changed, 81 insertions(+), 64 deletions(-)

diff --git a/configs/datasets/subjective/compassarena/compassarena_compare.py b/configs/datasets/subjective/compassarena/compassarena_compare.py
index 74ee38ae..89d866e0 100644
--- a/configs/datasets/subjective/compassarena/compassarena_compare.py
+++ b/configs/datasets/subjective/compassarena/compassarena_compare.py
@@ -88,19 +88,6 @@ math_prompt = """
 
 reason_prompt = math_prompt
 
-qa_prompt = """
-请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
-评分要求(重要性依次递减):
-1. 好的回答必须首先具有事实正确性,即除了想象的内容外,所引用或阐述的各种信息都是真实正确的
-2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答,且前后连贯,逻辑没有问题
-3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误
-
-[用户问题]
-{question}
-""" + base_prompt
-
-
-
 creation_prompt = """
 请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
 评分要求(重要性依次递减):
@@ -112,11 +99,9 @@ creation_prompt = """
 {question}
 """ + base_prompt
 
+sub_map = {"knowledge": knowledge_prompt, "language": language_prompt, "math_v2": math_prompt, "reason_v2": reason_prompt, "creationv2_zh": creation_prompt}
 
-subjective_all_sets = ["knowledge", "language", "math", "reason", "qa", "creationv2_zh"]
-prompt_all_sets = [knowledge_prompt, language_prompt, math_prompt, reason_prompt, qa_prompt, creation_prompt]
-
-for _name,_prompt in zip(subjective_all_sets, prompt_all_sets):
+for _name, _prompt in sub_map.items():
     subjective_infer_cfg = dict(
             prompt_template=dict(
                 type=PromptTemplate,
diff --git a/configs/eval_subjective_compassarena.py b/configs/eval_subjective_compassarena.py
index 58cf9bf4..3ac0b86c 100644
--- a/configs/eval_subjective_compassarena.py
+++ b/configs/eval_subjective_compassarena.py
@@ -3,7 +3,6 @@ from opencompass.models import HuggingFaceCausalLM
 from mmengine.config import read_base
 with read_base():
     from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
-    from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
     from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets
 
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
@@ -19,7 +18,7 @@ from opencompass.summarizers import CompassArenaSummarizer
 
 infer = dict(
     #partitioner=dict(type=NaivePartitioner),
-    partitioner=dict(type=SizePartitioner, max_task_size=10000),
+    partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
     runner=dict(
         type=SlurmSequentialRunner,
         partition='llm_dev2',
@@ -47,12 +46,12 @@ gpt4 = dict(
     retry=20,
     temperature = 1
 )
-models = [*chatglm3_6b_32k_model, *yi_6b_chat_model]
+models = [*chatglm3_6b_32k_model]
 
 datasets = [*subjective_datasets]
 
 
-work_dir = 'outputs/compass_arena/'
+work_dir = 'outputs/compass_arena_debug/'
 # -------------Inferen Stage ----------------------------------------
 
 
@@ -68,6 +67,7 @@ judge_model = dict(
     retry=20,
     temperature = 0
 )
+
 ## ------------- Evaluation Configuration
 eval = dict(
     partitioner=dict(
@@ -76,7 +76,7 @@ eval = dict(
         max_task_size=10000,
         mode='m2n',
         base_models = [gpt4],
-        compare_models = [*chatglm3_6b_32k_model, *yi_6b_chat_model, ]
+        compare_models = [*chatglm3_6b_32k_model]
     ),
     runner=dict(
        type=SlurmSequentialRunner,
@@ -91,5 +91,6 @@ eval = dict(
 )
 
 summarizer = dict(
-    type=CompassArenaSummarizer
+    type=CompassArenaSummarizer,
+    summary_type='half_add'
 )
\ No newline at end of file
diff --git a/opencompass/summarizers/subjective/compass_arena.py b/opencompass/summarizers/subjective/compass_arena.py
index 5619fa16..d23c9804 100644
--- a/opencompass/summarizers/subjective/compass_arena.py
+++ b/opencompass/summarizers/subjective/compass_arena.py
@@ -35,7 +35,7 @@ def check_position_bias(judged_answers, references, banned_choice=['C']):
     position_bias_flag = 0
     position_bias_dict = {}
     for judge, ref in zip(judged_answers, references):
-        question = ref['others']['question']
+        question = ref['question']
         question_hash = hash(question)
         if question_hash not in position_bias_dict:
             position_bias_dict[question_hash] = {
@@ -58,7 +58,11 @@ class CompassArenaSummarizer:
     It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict, judge_type='general') -> None:
+    def __init__(self,
+                 config: ConfigDict,
+                 judge_type='general',
+                 check_pos_bias=True,
+                 summary_type='single') -> None:
         self.tasks = []
         self.cfg = config
         self.base_models = self.cfg['eval']['partitioner']['base_models']
@@ -70,10 +74,13 @@ class CompassArenaSummarizer:
             'general': post_process_compass_arena,
         }
         self.judge_function = self.judge_map[self.judge_type]
+        self.check_pos_bias = check_pos_bias
+        self.summary_type = summary_type
 
-    def summarize(self,
-                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
-                  check_pos_bias=True):
+    def summarize(
+            self,
+            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
+    ):
         """Summarize the subjectivity analysis based on evaluation results.
 
         Args:
@@ -88,25 +95,25 @@ class CompassArenaSummarizer:
             product(self.base_models, self.compare_models))
         unique_combinations = remove_duplicate_pairs(
            [combo for combo in model_combinations if combo[0] != combo[1]])
+        judge_model = self.judge_abbr
         fout_list = []
-        for model_pair in unique_combinations:
-            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
-                'abbr'], self.judge_abbr
-            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
-            subdir_path = os.path.join(results_folder, subdir)
-            if os.path.isdir(subdir_path):
-                for dataset in dataset_cfgs:
-                    dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    fout = osp.join(
-                        output_dir, 'judged-by--' + judge_model + '-' +
-                        dataset_abbr + '-report.csv')
-                    fout_list.append(fout)
+        for dataset in dataset_cfgs:
+            dataset_abbr = dataset_abbr_from_cfg(dataset)
+            fout = osp.join(
+                output_dir, 'judged-by--' + judge_model + '-' + dataset_abbr +
+                '-report.csv')
+            fout_list.append(fout)
+            for model_pair in unique_combinations:
+                model1, model2, = model_pair[0]['abbr'], model_pair[1]['abbr'],
+                subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+                subdir_path = os.path.join(results_folder, subdir)
+                if os.path.isdir(subdir_path):
                     judged_answers, references = get_judgeanswer_and_reference(
                         dataset,
                         subdir_path,
                         self.judge_function,
                     )
-                    if check_pos_bias:
+                    if self.check_pos_bias:
                         bias_num = check_position_bias(judged_answers,
                                                        references)
                     else:
@@ -117,24 +124,47 @@ class CompassArenaSummarizer:
                         'answer2']
                     for prediction, reference in zip(judged_answers,
                                                      references):
-                        if dataset_abbr == 'qa':
-                            reference['capability'] = 'QA'
-                        categories['total'] += 1
-                        categories[reference['capability']] += 1
-                        if prediction == 'A':
-                            if reference['answer1'] == model1:
-                                win_model1[reference['capability']] += 1
-                                win_model1['total'] += 1
-                            else:
-                                win_model2[reference['capability']] += 1
-                                win_model2['total'] += 1
-                        elif prediction == 'B':
-                            if reference['answer1'] == model1:
-                                win_model2[reference['capability']] += 1
-                                win_model2['total'] += 1
-                            else:
-                                win_model1[reference['capability']] += 1
-                                win_model1['total'] += 1
+                        if self.summary_type == 'single':
+                            if prediction == 'A':
+                                categories['total'] += 1
+                                categories[reference['capability']] += 1
+                                if reference['answer1'] == model1:
+                                    win_model1[reference['capability']] += 1
+                                    win_model1['total'] += 1
+                                else:
+                                    win_model2[reference['capability']] += 1
+                                    win_model2['total'] += 1
+                            elif prediction == 'B':
+                                categories['total'] += 1
+                                categories[reference['capability']] += 1
+                                if reference['answer1'] == model1:
+                                    win_model2[reference['capability']] += 1
+                                    win_model2['total'] += 1
+                                else:
+                                    win_model1[reference['capability']] += 1
+                                    win_model1['total'] += 1
+                        elif self.summary_type == 'half_add':
+                            categories['total'] += 1
+                            categories[reference['capability']] += 1
+                            if prediction == 'A':
+                                if reference['answer1'] == model1:
+                                    win_model1[reference['capability']] += 1
+                                    win_model1['total'] += 1
+                                else:
+                                    win_model2[reference['capability']] += 1
+                                    win_model2['total'] += 1
+                            elif prediction == 'B':
+                                if reference['answer1'] == model1:
+                                    win_model2[reference['capability']] += 1
+                                    win_model2['total'] += 1
+                                else:
+                                    win_model1[reference['capability']] += 1
+                                    win_model1['total'] += 1
+                            elif prediction == 'C':
+                                win_model1[reference['capability']] += 0.5
+                                win_model1['total'] += 0.5
+                                win_model2[reference['capability']] += 0.5
+                                win_model2['total'] += 0.5
                     for capability in categories:
                         if capability not in win_model1:
                             win_model1[capability] = 0.0
@@ -166,8 +196,8 @@ class CompassArenaSummarizer:
                             writer.writerow(
                                 [row] +
                                 [scores[row][column] for column in columns])
-            else:
-                print(subdir_path + ' is not exist! please check!')
+                else:
+                    print(subdir_path + ' is not exist! please check!')
         for fout in fout_list:
             with open(fout, 'r') as f:
                 x = from_csv(f)
diff --git a/opencompass/summarizers/subjective/utils.py b/opencompass/summarizers/subjective/utils.py
index 542c7600..1a2cf994 100644
--- a/opencompass/summarizers/subjective/utils.py
+++ b/opencompass/summarizers/subjective/utils.py
@@ -64,9 +64,10 @@ def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
         if processed_judge is not None:
             judged_answers.append(processed_judge)
             references.append(v['gold'])
-    print(
-        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
-    )
+    if len(judged_answers) != len(result):
+        print(
+            f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!'
+        )
     if len(judged_answers) == 0:
         print('*' * 100)
         print(
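
The summary_type option introduced by this patch controls how judge verdicts are tallied: 'single' keeps the original counting, where only 'A' and 'B' verdicts are scored, while 'half_add' counts every judged comparison and additionally credits a tie verdict ('C') as 0.5 to both models. A minimal, illustrative config sketch (not part of the patch itself), mirroring the summarizer block in configs/eval_subjective_compassarena.py above:

from opencompass.summarizers import CompassArenaSummarizer

# Sketch only: tie-aware tallying; use summary_type='single' for the original
# win/loss-only counting.
summarizer = dict(
    type=CompassArenaSummarizer,
    summary_type='half_add',
)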