[Fix] hotfix for mtbench (#877)

* hotfix for mtbench

* hotfix
Author: bittersweet1999
Date: 2024-02-06 21:26:47 +08:00 (committed by GitHub)
Parent: d34ba11106
Commit: 1c8e193de8
2 changed files with 23 additions and 10 deletions

File 1 of 2:

@@ -2,7 +2,6 @@ from mmengine.config import read_base
 with read_base():
     from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets
     # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
-
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
@@ -18,6 +17,7 @@ from opencompass.summarizers import MTBenchSummarizer
 api_meta_template = dict(
     round=[
+        dict(role='SYSTEM', api_role='SYSTEM'),
         dict(role='HUMAN', api_role='HUMAN'),
         dict(role='BOT', api_role='BOT', generate=True),
     ]
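
The template above controls how each dialogue turn is tagged before being sent to the API judge; the added SYSTEM round lets system prompts pass through. As a rough illustration (invented content and field names; the actual serialization happens inside OpenCompass), a multi-turn MT-Bench exchange maps onto these roles like this:

# Illustrative only: a hypothetical two-turn exchange using the roles
# declared in api_meta_template. generate=True on the BOT round marks
# the turn that the model is asked to produce.
dialogue = [
    dict(role='SYSTEM', prompt='You are a helpful assistant.'),
    dict(role='HUMAN', prompt='Write a haiku about autumn.'),
    dict(role='BOT', prompt='Leaves spiral earthward...'),
    dict(role='HUMAN', prompt='Now rewrite it as a limerick.'),
]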
@@ -54,10 +54,10 @@ models = [
 datasets = [*subjective_datasets]
 
 infer = dict(
-    partitioner=dict(type=SizePartitioner, max_task_size=100),
+    partitioner=dict(type=SizePartitioner, max_task_size=10000),
     runner=dict(
         type=SlurmSequentialRunner,
-        partition='llmeval',
+        partition='llm_dev2',
         quotatype='auto',
         max_num_workers=256,
         task=dict(type=OpenICLInferTask),
@@ -70,12 +70,12 @@ infer = dict(
 judge_model = dict(
     abbr='GPT4-Turbo',
     type=OpenAIAllesAPIN,
-    path='gpt-4-0613',
+    path='gpt-4-0613',  # To compare with the official leaderboard, please use gpt4-0613
     key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
     url='xxxx',
     meta_template=api_meta_template,
     query_per_second=16,
-    max_out_len=1024,
+    max_out_len=2048,
     max_seq_len=2048,
     batch_size=8,
     temperature=0,
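
A note on the key='xxxx' placeholder: as the inline comment says, the key falls back to $OPENAI_API_KEY when not set explicitly. A minimal sketch of that fallback (assumed behavior for illustration, not the actual OpenAIAllesAPIN source):

import os

# Assumed fallback: prefer an explicit key, otherwise read the environment.
def resolve_api_key(configured_key: str = 'xxxx') -> str:
    if configured_key and configured_key != 'xxxx':
        return configured_key
    key = os.environ.get('OPENAI_API_KEY')
    if not key:
        raise ValueError('Set OPENAI_API_KEY or fill in the `key` field.')
    return key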
@@ -95,7 +95,7 @@ judge_model = dict(
 ## single evaluation
 eval = dict(
-    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100, mode='singlescore', models=models),
+    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models),
     runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
 )
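
Both partitioner changes raise max_task_size from 100 to 10000, so an evaluation run is split into far fewer tasks. The following sketch shows the general idea of size-based partitioning (an illustration of the concept only, not SizePartitioner's actual implementation):

# Hypothetical illustration: greedily pack dataset chunks into tasks whose
# total size stays under max_task_size. With max_task_size=100, a
# 2,000-sample dataset becomes ~20 tasks; with 10000 it fits in one task.
def partition(sizes: list[int], max_task_size: int) -> list[list[int]]:
    tasks, current, used = [], [], 0
    for size in sizes:
        if current and used + size > max_task_size:
            tasks.append(current)
            current, used = [], 0
        current.append(size)
        used += size
    if current:
        tasks.append(current)
    return tasks

print(len(partition([50] * 40, 100)))    # 20 tasks
print(len(partition([50] * 40, 10000)))  # 1 task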

File 2 of 2:

@@ -17,11 +17,10 @@ except ImportError:
 from opencompass.utils import model_abbr_from_cfg
 
 from .compass_arena import CompassArenaSummarizer
-from .subjective_post_process import post_process_autoj
 from .utils import get_judgeanswer_and_reference, get_outdir
 
 
-def post_process_mtbench(judgement: str):
+def post_process_mtbench_pair(judgement: str):
     """Input a string like below:
 
     xxx[[A]]xxx, and extract the judge
@@ -34,6 +33,20 @@ def post_process_mtbench(judgement: str):
     return None
 
 
+def post_process_mtbench_single(judgement: str):
+    """Input a string like below:
+
+    xxx[[5]]xxx, and extract the score
+    """
+    pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
+    matched_result = re.findall(pattern, judgement)
+    if matched_result:
+        score = float(matched_result[0])
+    else:
+        return None
+    return {'score': score}
+
+
 def get_capability_results(
     judged_answers,
     references,
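
A quick sanity check of the new parser on a typical judge reply (the sample text is invented; the function is the one added above):

judgement = 'The answer is concise and accurate. Rating: [[8.5]]'
print(post_process_mtbench_single(judgement))     # {'score': 8.5}

# Replies without a well-formed rating yield None and can be filtered out.
print(post_process_mtbench_single('no verdict'))  # None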
@@ -87,8 +100,8 @@ class MTBenchSummarizer(CompassArenaSummarizer):
             'compare_models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
         self.judge_map = {
-            'single': post_process_autoj,
-            'pair': post_process_mtbench
+            'single': post_process_mtbench_single,
+            'pair': post_process_mtbench_pair
         }
         self.judge_function = self.judge_map[self.judge_type]
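
With the renamed helpers in place, judge_map resolves the right parser from judge_type: 'single' expects Rating: [[x]] scores, 'pair' expects [[A]]/[[B]] verdicts. Roughly (a sketch mirroring the mapping above; judge_type comes from the summarizer config):

judge_map = {
    'single': post_process_mtbench_single,
    'pair': post_process_mtbench_pair,
}
judge_function = judge_map['single']
print(judge_function('Helpful and correct. Rating: [[9]]'))  # {'score': 9.0}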