From 1c8e193de8c20dfeb8decccb7f079d5ad2f68748 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Tue, 6 Feb 2024 21:26:47 +0800
Subject: [PATCH] [Fix] hotfix for mtbench (#877)

* hotfix for mtbench

* hotfix
---
 configs/eval_subjective_mtbench.py            | 12 +++++------
 opencompass/summarizers/subjective/mtbench.py | 21 +++++++++++++++----
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py
index 6ccb5e74..e7d86ea6 100644
--- a/configs/eval_subjective_mtbench.py
+++ b/configs/eval_subjective_mtbench.py
@@ -2,7 +2,6 @@ from mmengine.config import read_base
 
 with read_base():
     from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets
-    # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
 
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
 
@@ -18,6 +17,7 @@ from opencompass.summarizers import MTBenchSummarizer
 
 api_meta_template = dict(
     round=[
+        dict(role='SYSTEM', api_role='SYSTEM'),
         dict(role='HUMAN', api_role='HUMAN'),
         dict(role='BOT', api_role='BOT', generate=True),
     ]
@@ -54,10 +54,10 @@ models = [
 datasets = [*subjective_datasets]
 
 infer = dict(
-    partitioner=dict(type=SizePartitioner, max_task_size=100),
+    partitioner=dict(type=SizePartitioner, max_task_size=10000),
     runner=dict(
         type=SlurmSequentialRunner,
-        partition='llmeval',
+        partition='llm_dev2',
         quotatype='auto',
         max_num_workers=256,
         task=dict(type=OpenICLInferTask),
@@ -70,12 +70,12 @@ infer = dict(
 judge_model = dict(
     abbr='GPT4-Turbo',
     type=OpenAIAllesAPIN,
-    path='gpt-4-0613',
+    path='gpt-4-0613',  # To compare with the official leaderboard, please use gpt-4-0613
     key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
     url='xxxx',
     meta_template=api_meta_template,
     query_per_second=16,
-    max_out_len=1024,
+    max_out_len=2048,
     max_seq_len=2048,
     batch_size=8,
     temperature=0,
@@ -95,7 +95,7 @@ judge_model = dict(
 
 ## single evaluation
 eval = dict(
-    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100, mode='singlescore', models=models),
+    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models),
     runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
 )
 
diff --git a/opencompass/summarizers/subjective/mtbench.py b/opencompass/summarizers/subjective/mtbench.py
index 771bf39b..4b71d7be 100644
--- a/opencompass/summarizers/subjective/mtbench.py
+++ b/opencompass/summarizers/subjective/mtbench.py
@@ -17,11 +17,10 @@ except ImportError:
 
 from opencompass.utils import model_abbr_from_cfg
 from .compass_arena import CompassArenaSummarizer
-from .subjective_post_process import post_process_autoj
 from .utils import get_judgeanswer_and_reference, get_outdir
 
 
-def post_process_mtbench(judgement: str):
+def post_process_mtbench_pair(judgement: str):
     """Input a string like below:
 
     xxx[[A]]xxx, and extract the judge
@@ -34,6 +33,20 @@
     return None
 
 
+def post_process_mtbench_single(judgement: str):
+    """Input a string like below:
+
+    xxx Rating: [[5]] xxx, and extract the score
+    """
+    pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
+    matched_result = re.findall(pattern, judgement)
+    if matched_result:
+        score = float(matched_result[0])
+    else:
+        return None
+    return {'score': score}
+
+
 def get_capability_results(
     judged_answers,
     references,
@@ -87,8 +100,8 @@ class MTBenchSummarizer(CompassArenaSummarizer):
             'compare_models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
         self.judge_map = {
-            'single': post_process_autoj,
-            'pair': post_process_mtbench
+            'single': post_process_mtbench_single,
+            'pair': post_process_mtbench_pair
         }
         self.judge_function = self.judge_map[self.judge_type]
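
For reviewers: a minimal standalone sanity check of the new single-score
post-processor. The function body is copied from the hunk above so it can be
run without an opencompass checkout; the sample judgement strings are
hypothetical illustrations of the MTBench single-judge output format, not
taken from the repository. (post_process_mtbench_pair is not exercised here
because its regex body lies outside this diff's context lines.)

    import re

    def post_process_mtbench_single(judgement: str):
        """Extract the score from a judgement like 'xxx Rating: [[5]] xxx'."""
        pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
        matched_result = re.findall(pattern, judgement)
        if matched_result:
            score = float(matched_result[0])
        else:
            return None
        return {'score': score}

    # Well-formed judgement: the rating is wrapped in double brackets.
    assert post_process_mtbench_single('Helpful and accurate. Rating: [[8.5]]') == {'score': 8.5}
    # Malformed judgement (no 'Rating: [[...]]' marker): treated as unparseable.
    assert post_process_mtbench_single('I would rate this an 8.') is None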