From 848e7c8a7668b57fe9cbb548034528a0c4345bf3 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Mon, 11 Mar 2024 17:24:39 +0800 Subject: [PATCH] [fix] add different temp for different question in mtbench (#954) * add temp for mtbench * add document for mtbench * add document for mtbench --- .../mtbench_single_judge_diff_temp.py | 64 +++++++++++++++++++ configs/eval_subjective_mtbench.py | 37 ++++++----- .../advanced_guides/subjective_evaluation.md | 48 ++++++++++++++ .../advanced_guides/subjective_evaluation.md | 48 ++++++++++++++ .../icl_inferencer/icl_chat_inferencer.py | 16 ++++- opencompass/summarizers/subjective/mtbench.py | 10 ++- 6 files changed, 202 insertions(+), 21 deletions(-) create mode 100644 configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py diff --git a/configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py b/configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py new file mode 100644 index 00000000..dc7c406e --- /dev/null +++ b/configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import MTBenchDataset + + +subjective_reader_cfg = dict( + input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'], + output_column='judge', + ) + +subjective_all_sets = [ + "mtbench_0.0","mtbench_0.1","mtbench_0.7" +] +data_path ="data/subjective/mtbench" + +subjective_datasets = [] + +for _name in subjective_all_sets: + temperature = float(_name.split('_')[1]) + do_sample = False if temperature == 0.0 else True + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template="""{dialogue}""", + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, temperature=temperature, do_sample=do_sample,infer_mode='every'), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="{system_prompt}") + ], + round=[ + dict( + role='HUMAN', + prompt = "{prompt_template}" + ), + ]), + ), + ), + pred_role="BOT", + ) + + subjective_datasets.append( + dict( + abbr=f"{_name}", + type=MTBenchDataset, + path=data_path, + name=_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py index e7d86ea6..940edabb 100644 --- a/configs/eval_subjective_mtbench.py +++ b/configs/eval_subjective_mtbench.py @@ -1,7 +1,7 @@ from mmengine.config import read_base with read_base(): - from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets + from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 @@ -23,38 +23,44 @@ api_meta_template = dict( ] ) +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict(role="BOT", 
begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
+    ],
+)
 # -------------Inference Stage ----------------------------------------
 # For subjective evaluation, we often set do sample for models
 models = [
     dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
+        type=HuggingFaceCausalLM,
+        abbr='qwen-7b-chat-hf',
+        path="Qwen/Qwen-7B-Chat",
+        tokenizer_path='Qwen/Qwen-7B-Chat',
         model_kwargs=dict(
             device_map='auto',
-            trust_remote_code=True,
+            trust_remote_code=True
         ),
         tokenizer_kwargs=dict(
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
+            use_fast=False,
         ),
-        generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=1,
+        pad_token_id=151643,
+        max_out_len=100,
+        max_seq_len=2048,
+        batch_size=8,
+        meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
     )
 ]

 datasets = [*subjective_datasets]

 infer = dict(
-    partitioner=dict(type=SizePartitioner, max_task_size=10000),
+    partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
     runner=dict(
         type=SlurmSequentialRunner,
         partition='llm_dev2',
@@ -80,7 +86,6 @@ judge_model = dict(
         batch_size=8,
         temperature=0,
     )
-
 ## ------------- Evaluation Configuration
 # ## pair evaluation
 # eval = dict(
@@ -95,7 +100,7 @@ judge_model = dict(

 ## single evaluation
 eval = dict(
-    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models),
+    partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models),
     runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
 )

diff --git a/docs/en/advanced_guides/subjective_evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md
index 9746a0fd..b4304bce 100644
--- a/docs/en/advanced_guides/subjective_evaluation.md
+++ b/docs/en/advanced_guides/subjective_evaluation.md
@@ -202,6 +202,54 @@ Consider cite the following paper if you find it helpful:
 }
 ```

+## Multi-round Subjective Evaluation in OpenCompass
+
+In OpenCompass, we also support subjective multi-turn dialogue evaluation. For example, the MT-Bench evaluation is configured in `configs/eval_subjective_mtbench.py`.
+
+For multi-turn dialogue evaluation, you need to organize the data into the following dialogue structure:
+
+```
+"dialogue": [
+    {
+        "role": "user",
+        "content": "Imagine you are participating in a race with a group of people. If you have just overtaken the second person, what's your current position? Where is the person you just overtook?"
+    },
+    {
+        "role": "assistant",
+        "content": ""
+    },
+    {
+        "role": "user",
+        "content": "If the \"second person\" is changed to \"last person\" in the above question, what would the answer be?"
+    },
+    {
+        "role": "assistant",
+        "content": ""
+    }
+],
+```
+
+It's important to note that different question types in MT-Bench use different temperature settings, so we need to split the original data file into three subsets by temperature and run inference on each subset separately, setting an appropriate temperature for each. For the specific settings, please refer to `configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`.
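+
+The three subsets are just the original MT-Bench questions grouped by the temperature assigned to each question category. The sketch below illustrates one way to do that grouping; the `question.jsonl` layout, the output file names, and the category-to-temperature mapping (taken from the FastChat defaults) are assumptions, and the exact fields expected by `MTBenchDataset` are not shown here, so adapt it to your local copy of the data.
+
+```python
+# Hypothetical helper: group MT-Bench questions into per-temperature subsets
+# (mtbench_0.0, mtbench_0.1, mtbench_0.7). The mapping mirrors the FastChat
+# defaults at the time of writing; verify it against your own data.
+import json
+from collections import defaultdict
+
+TEMPERATURE_BY_CATEGORY = {
+    'writing': 0.7, 'roleplay': 0.7,
+    'stem': 0.1, 'humanities': 0.1,
+    'coding': 0.0, 'math': 0.0, 'reasoning': 0.0, 'extraction': 0.0,
+}
+
+
+def split_by_temperature(src='question.jsonl', out_prefix='mtbench'):
+    subsets = defaultdict(list)
+    with open(src, encoding='utf-8') as f:
+        for line in f:
+            item = json.loads(line)
+            # Fall back to 0.1 for any category missing from the mapping.
+            temp = TEMPERATURE_BY_CATEGORY.get(item['category'], 0.1)
+            subsets[temp].append(item)
+    for temp, items in subsets.items():
+        # Writes e.g. mtbench_0.0.json, mtbench_0.1.json, mtbench_0.7.json.
+        with open(f'{out_prefix}_{temp}.json', 'w', encoding='utf-8') as fout:
+            json.dump(items, fout, ensure_ascii=False, indent=2)
+
+
+if __name__ == '__main__':
+    split_by_temperature()
+```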
+
+Consider citing the following paper if you find it helpful:
+
+```bibtex
+@misc{zheng2023judging,
+    title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
+    author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
+    year={2023},
+    eprint={2306.05685},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+@misc{2023opencompass,
+    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
+    author={OpenCompass Contributors},
+    howpublished = {\url{https://github.com/open-compass/opencompass}},
+    year={2023}
+}
+```
+
 ## Practice: AlignBench Evaluation

 ### Dataset
diff --git a/docs/zh_cn/advanced_guides/subjective_evaluation.md b/docs/zh_cn/advanced_guides/subjective_evaluation.md
index 555c3f8e..f59d8135 100644
--- a/docs/zh_cn/advanced_guides/subjective_evaluation.md
+++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md
@@ -202,6 +202,54 @@ Opencompass 已经支持了很多的JudgeLLM,实际上,你可以将Opencompa
 }
 ```

+## 主观多轮对话评测
+
+在OpenCompass中我们同样支持了主观的多轮对话评测。以MT-Bench为例,其评测配置可参见`configs/eval_subjective_mtbench.py`。
+
+在多轮对话评测中,你需要将数据整理为如下的dialogue格式:
+
+```
+"dialogue": [
+    {
+        "role": "user",
+        "content": "Imagine you are participating in a race with a group of people. If you have just overtaken the second person, what's your current position? Where is the person you just overtook?"
+    },
+    {
+        "role": "assistant",
+        "content": ""
+    },
+    {
+        "role": "user",
+        "content": "If the \"second person\" is changed to \"last person\" in the above question, what would the answer be?"
+    },
+    {
+        "role": "assistant",
+        "content": ""
+    }
+],
+```
+
+值得注意的是,由于MT-Bench中不同类型的题目设置了不同的温度,因此我们需要先将原始数据文件按照温度拆分成三个不同的子集,分别推理,并为不同的子集设置不同的温度。具体设置请参见`configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`。
+
+如果使用了该方法,请添加引用:
+
+```bibtex
+@misc{zheng2023judging,
+    title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
+    author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. 
Gonzalez and Ion Stoica}, + year={2023}, + eprint={2306.05685}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@misc{2023opencompass, + title={OpenCompass: A Universal Evaluation Platform for Foundation Models}, + author={OpenCompass Contributors}, + howpublished = {\url{https://github.com/open-compass/opencompass}}, + year={2023} +} +``` + ## 实战:AlignBench 主观评测 ### 数据集准备 diff --git a/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py b/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py index 3ad13a13..ee28b140 100644 --- a/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py @@ -172,6 +172,8 @@ class ChatInferencer(BaseInferencer): output_json_filepath: Optional[str] = './icl_inference_output', output_json_filename: Optional[str] = 'predictions', save_every: Optional[int] = 1, + temperature: Optional[float] = 0.0, + do_sample: Optional[bool] = False, infer_mode: str = 'last', **kwargs) -> None: super().__init__( @@ -182,6 +184,8 @@ class ChatInferencer(BaseInferencer): ) assert infer_mode in ['last', 'every', 'every_with_gt'] self.infer_mode = infer_mode + self.temperature = temperature + self.do_sample = do_sample self.model: BaseModel self._set_meta_template(self.model) @@ -347,8 +351,16 @@ class ChatInferencer(BaseInferencer): for i in assistant_indices: history = chat[:i] - output = self.model.generate_from_template([history], - max_out_len=512)[0] + if self.do_sample: + output = self.model.generate_from_template( + [history], + do_sample=self.do_sample, + temperature=self.temperature, + max_out_len=512)[0] + else: + output = self.model.generate_from_template([history], + do_sample=False, + max_out_len=512)[0] chat[i]['content'] = output if not self.dialogue_mode: output_handler.save_multiround_results( diff --git a/opencompass/summarizers/subjective/mtbench.py b/opencompass/summarizers/subjective/mtbench.py index 4b71d7be..8d80544f 100644 --- a/opencompass/summarizers/subjective/mtbench.py +++ b/opencompass/summarizers/subjective/mtbench.py @@ -127,12 +127,16 @@ class MTBenchSummarizer(CompassArenaSummarizer): fout = osp.join( output_dir, 'judged-by--' + judge_model + '-capability.csv') + overall_judged_answers, overall_references = [], [] for dataset in dataset_cfgs: judged_answers, references = get_judgeanswer_and_reference( dataset, subdir_path, self.judge_function) - get_capability_results(judged_answers, references, - fout, fout_flag, model) - fout_flag += 1 + overall_judged_answers += judged_answers + overall_references += references + get_capability_results(overall_judged_answers, + overall_references, fout, fout_flag, + model) + fout_flag += 1 else: print(subdir_path + ' is not exist! please check!') with open(fout, 'r') as f: