From f407930475e4d7cf3338eb9a5b1ac4f03916d7f6 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Thu, 20 Feb 2025 12:19:46 +0800 Subject: [PATCH] [Feature] Support subjective evaluation for reasoning model (#1868) * fix pip version * fix pip version * add subeval for reasoning model * add subeval for reasoning model * update configs * update config * update config * update config * update files --- .../alignbench_judgeby_critiquellm.py | 2 +- .../alignbench_judgeby_critiquellm_new.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm_new.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4.py | 3 ++- .../alpacav2_judgeby_gpt4_bradleyterry.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4_new.py | 2 +- .../arena_hard/arena_hard_compare.py | 2 +- .../arena_hard_compare_bradleyterry.py | 2 +- .../arena_hard/arena_hard_compare_new.py | 2 +- .../compassarena/compassarena_compare.py | 2 +- .../compassarena_compare_bradleyterry.py | 4 ++-- .../compassarena/compassarena_compare_new.py | 2 +- .../subjective/fofo/fofo_bilingual_judge.py | 2 +- .../fofo/fofo_bilingual_judge_new.py | 2 +- .../datasets/subjective/fofo/fofo_judge.py | 2 +- .../datasets/subjective/fofo/fofo_judge_new.py | 2 +- .../followbench/followbench_llmeval.py | 2 +- .../followbench/followbench_llmeval_new.py | 2 +- .../subjective/multiround/mtbench101_judge.py | 2 +- .../multiround/mtbench101_judge_new.py | 2 +- .../openicl/icl_evaluator/lm_evaluator.py | 2 +- opencompass/tasks/subjective_eval.py | 18 +++++++++++++++--- opencompass/utils/text_postprocessors.py | 12 ++++++++++++ 24 files changed, 51 insertions(+), 26 deletions(-) diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py index 86c2a80b..0bc7df77 100644 --- 
a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py index 20797b0f..d3f59b9f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py index 024f66a1..44f63f4f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py index 2ff09a3e..216e6ffa 100644 --- 
a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py index 137e5ca0..ad0d4ef4 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py @@ -73,12 +73,13 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, + prompt_template=dict( type=PromptTemplate, template=dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py index 99f2e2be..19fe1559 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py @@ -74,7 +74,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py index 06a82efe..a0510f5c 100644 --- 
a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py @@ -72,7 +72,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py index 90837c7b..7446fdd7 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py @@ -38,7 +38,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py index 7a0e9ae8..dc4b250e 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py @@ -39,7 +39,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py index 08b27ca7..dbad40ef 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py @@ -37,7 +37,7 @@ for _name in 
subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py index 90141e66..47cc7b31 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py index 8a687889..38d7927a 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py @@ -1,6 +1,6 @@ from opencompass.datasets import ( CompassArenaDataset, - compassarena_bradleyterry_postprocess, + compassarena_bradleyterry_postprocess ) from opencompass.openicl.icl_evaluator import LMEvaluator from opencompass.openicl.icl_inferencer import GenInferencer @@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items(): ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py index a32691ad..83266765 100644 --- 
a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py @@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py index 089fd101..9516e074 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py @@ -91,7 +91,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py index 81e160b5..f732dba0 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py @@ -90,7 +90,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py index 89400892..8944be01 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py @@ -59,7 +59,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + 
inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py index 691aff2b..03dcf190 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py @@ -58,7 +58,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py index e601bda3..1c4203fd 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -29,7 +29,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py index b0aacd86..970605b6 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py @@ -28,7 +28,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py index 00924ecb..53ab1631 100644 --- 
a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py @@ -24,7 +24,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py index 938f2f5e..01b9c12f 100644 --- a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py @@ -23,7 +23,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 53814070..94f2cf94 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -329,4 +329,4 @@ class LMEvaluator: else: kwargs = self.dict_postprocessor proc = DICT_POSTPROCESSORS.get(kwargs.pop('type')) - return proc(output, self.output_path, **kwargs) + return proc(output, self.output_path, **kwargs) \ No newline at end of file diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py index 417c5cdb..0ddd7b0c 100644 --- a/opencompass/tasks/subjective_eval.py +++ b/opencompass/tasks/subjective_eval.py @@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 
'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor: kwargs = pred_postprocessor or eval_cfg['evaluator'][ 'pred_postprocessor'] - proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) self.logger.info('Get postprocessor {postprocessor}.') - pred_strs = [proc(s, **kwargs) for s in pred_strs] + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] else: - self.logger.info('No postprocessor found.') + self.logger.info('No dataset postprocessor found.') + + if 'pred_postprocessor' in model_cfg or pred_postprocessor: + kwargs = pred_postprocessor or model_cfg['pred_postprocessor'] + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] + else: + self.logger.info('No model postprocessor found.') return { 'model_name': model_abbr_from_cfg(model_cfg), @@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 'pred_postprocessor' in eval_cfg or pred_postprocessor: + kwargs = copy.deepcopy(pred_postprocessor or eval_cfg['pred_postprocessor']) proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) pred_strs = [proc(s, **kwargs) for s in pred_strs] diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index eb7469ab..7110e752 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -57,6 +57,18 @@ def last_capital_postprocess(text: str) -> str: return '' +@TEXT_POSTPROCESSORS.register_module('think_pred') +def think_pred_postprocess( + prediction: str, + re_pattern: str, +) -> str: + match = re.search(re_pattern, prediction) + if match: + return match.group(1).strip() + else: + return prediction + + def first_option_postprocess(text: str, options: str, cushion=True) -> str: """Find first valid option for 
text."""