From 1c8e193de8c20dfeb8decccb7f079d5ad2f68748 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Tue, 6 Feb 2024 21:26:47 +0800
Subject: [PATCH] [Fix] hotfix for mtbench (#877)

* hotfix for mtbench

* hotfix
---
 configs/eval_subjective_mtbench.py            | 12 +++++------
 opencompass/summarizers/subjective/mtbench.py | 21 +++++++++++++++----
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py
index 6ccb5e74..e7d86ea6 100644
--- a/configs/eval_subjective_mtbench.py
+++ b/configs/eval_subjective_mtbench.py
@@ -2,7 +2,6 @@ from mmengine.config import read_base
 
 with read_base():
     from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets
-    # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
 
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
 
@@ -18,6 +17,7 @@ from opencompass.summarizers import MTBenchSummarizer
 
 api_meta_template = dict(
     round=[
+        dict(role='SYSTEM', api_role='SYSTEM'),
         dict(role='HUMAN', api_role='HUMAN'),
         dict(role='BOT', api_role='BOT', generate=True),
     ]
@@ -54,10 +54,10 @@ models = [
 datasets = [*subjective_datasets]
 
 infer = dict(
-    partitioner=dict(type=SizePartitioner, max_task_size=100),
+    partitioner=dict(type=SizePartitioner, max_task_size=10000),
     runner=dict(
         type=SlurmSequentialRunner,
-        partition='llmeval',
+        partition='llm_dev2',
         quotatype='auto',
         max_num_workers=256,
         task=dict(type=OpenICLInferTask),
@@ -70,12 +70,12 @@ infer = dict(
 judge_model = dict(
     abbr='GPT4-Turbo',
     type=OpenAIAllesAPIN,
-    path='gpt-4-0613',
+    path='gpt-4-0613',  # To compare with the official leaderboard, please use gpt-4-0613
     key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
     url='xxxx',
     meta_template=api_meta_template,
     query_per_second=16,
-    max_out_len=1024,
+    max_out_len=2048,
     max_seq_len=2048,
     batch_size=8,
     temperature=0,
@@ -95,7 +95,7 @@ judge_model = dict(
 
 ## single evaluation
 eval = dict(
-    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100, mode='singlescore', models=models),
+    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models),
     runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
 )
 
diff --git a/opencompass/summarizers/subjective/mtbench.py b/opencompass/summarizers/subjective/mtbench.py
index 771bf39b..4b71d7be 100644
--- a/opencompass/summarizers/subjective/mtbench.py
+++ b/opencompass/summarizers/subjective/mtbench.py
@@ -17,11 +17,10 @@ except ImportError:
 
 from opencompass.utils import model_abbr_from_cfg
 from .compass_arena import CompassArenaSummarizer
-from .subjective_post_process import post_process_autoj
 from .utils import get_judgeanswer_and_reference, get_outdir
 
 
-def post_process_mtbench(judgement: str):
+def post_process_mtbench_pair(judgement: str):
     """Input a string like below:
 
     xxx[[A]]xxx, and extract the judge
@@ -34,6 +33,20 @@
     return None
 
 
+def post_process_mtbench_single(judgement: str):
+    """Input a string like below:
+
+    xxx Rating: [[5]] xxx, and extract the score
+    """
+    pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
+    matched_result = re.findall(pattern, judgement)
+    if matched_result:
+        score = float(matched_result[0])
+    else:
+        return None
+    return {'score': score}
+
+
 def get_capability_results(
     judged_answers,
     references,
@@ -87,8 +100,8 @@ class MTBenchSummarizer(CompassArenaSummarizer):
             'compare_models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
         self.judge_map = {
-            'single': post_process_autoj,
-            'pair': post_process_mtbench
+            'single': post_process_mtbench_single,
+            'pair': post_process_mtbench_pair
         }
         self.judge_function = self.judge_map[self.judge_type]
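
For reviewers: a minimal standalone sanity check of the new single-score
post-processor. The function body is copied from the hunk above so it can be
run without an opencompass checkout; the sample judgement strings are
hypothetical illustrations of the MTBench single-judge output format, not
taken from the repository. (post_process_mtbench_pair is not exercised here
because its regex body lies outside this diff's context lines.)

    import re

    def post_process_mtbench_single(judgement: str):
        """Extract the score from a judgement like 'xxx Rating: [[5]] xxx'."""
        pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
        matched_result = re.findall(pattern, judgement)
        if matched_result:
            score = float(matched_result[0])
        else:
            return None
        return {'score': score}

    # Well-formed judgement: the rating is wrapped in double brackets.
    assert post_process_mtbench_single('Helpful and accurate. Rating: [[8.5]]') == {'score': 8.5}
    # Malformed judgement (no 'Rating: [[...]]' marker): treated as unparseable.
    assert post_process_mtbench_single('I would rate this an 8.') is None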