[Fix] Add different temperatures for different questions in MTBench (#954)

* add temp for mtbench

* add document for mtbench

* add document for mtbench
bittersweet1999 2024-03-11 17:24:39 +08:00 committed by GitHub
parent 7c1a819bb4
commit 848e7c8a76
6 changed files with 202 additions and 21 deletions


@ -0,0 +1,64 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset
subjective_reader_cfg = dict(
    input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
    output_column='judge',
)

subjective_all_sets = [
    "mtbench_0.0", "mtbench_0.1", "mtbench_0.7"
]
data_path = "data/subjective/mtbench"

subjective_datasets = []

for _name in subjective_all_sets:
    temperature = float(_name.split('_')[1])
    do_sample = False if temperature == 0.0 else True

    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template="""{dialogue}""",
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(
            type=ChatInferencer,
            max_seq_len=4096,
            max_out_len=512,
            temperature=temperature,
            do_sample=do_sample,
            infer_mode='every'),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="{system_prompt}")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt="{prompt_template}"),
                    ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=MTBenchDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg))
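The subset name doubles as the decoding configuration: the suffix after the underscore is parsed as the temperature, and sampling is enabled only when it is non-zero. A standalone illustration of this convention (not part of the config itself):

```python
# Illustration of the subset-name convention used by the config above.
for name in ['mtbench_0.0', 'mtbench_0.1', 'mtbench_0.7']:
    temperature = float(name.split('_')[1])
    do_sample = temperature != 0.0
    print(f'{name}: temperature={temperature}, do_sample={do_sample}')
# mtbench_0.0: temperature=0.0, do_sample=False
# mtbench_0.1: temperature=0.1, do_sample=True
# mtbench_0.7: temperature=0.7, do_sample=True
```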


@ -1,7 +1,7 @@
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets
    from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
    # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
@ -23,38 +23,44 @@ api_meta_template = dict(
    ]
)

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
    ],
)

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=1,
        pad_token_id=151643,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
    )
]

datasets = [*subjective_datasets]

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=10000),
    partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llm_dev2',
@ -80,7 +86,6 @@ judge_model = dict(
    batch_size=8,
    temperature=0,
)

## ------------- Evaluation Configuration
# ## pair evaluation
# eval = dict(
@ -95,7 +100,7 @@ judge_model = dict(
## single evaluation
eval = dict(
    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models),
    partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models),
    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
)


@ -202,6 +202,54 @@ Consider cite the following paper if you find it helpful:
}
```
## Multi-round Subjective Evaluation in OpenCompass
In OpenCompass, we also support subjective multi-turn dialogue evaluation. For example, the MT-Bench evaluation is configured in `configs/eval_subjective_mtbench.py`.
For multi-turn dialogue evaluation, you need to organize the data into the following dialogue structure:
```
"dialogue": [
{
"role": "user",
"content": "Imagine you are participating in a race with a group of people. If you have just overtaken the second person, what's your current position? Where is the person you just overtook?"
},
{
"role": "assistant",
"content": ""
},
{
"role": "user",
"content": "If the \"second person\" is changed to \"last person\" in the above question, what would the answer be?"
},
{
"role": "assistant",
"content": ""
}
],
```
Note that the different question types in MT-Bench use different temperature settings, so the original data file is split into three subsets by temperature, and each subset is inferred separately with its own temperature. For the specific settings, please refer to `configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`.
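As a rough guide, the three subsets can be produced from the original MT-Bench question file with a few lines of Python. The sketch below is illustrative rather than the shipped preprocessing script: it assumes the standard MT-Bench category-to-temperature mapping and a `question.jsonl` whose entries carry `category` and `turns` fields, and it only fills the `dialogue` and `capability` columns (the released subset files also carry the judge's `system_prompt` and `prompt_template` fields).

```python
import json
from collections import defaultdict

# Standard MT-Bench mapping from question category to decoding temperature.
temperature_config = {
    'writing': 0.7, 'roleplay': 0.7, 'extraction': 0.0, 'math': 0.0,
    'coding': 0.0, 'reasoning': 0.0, 'stem': 0.1, 'humanities': 0.1,
}

subsets = defaultdict(list)
with open('question.jsonl') as f:  # original MT-Bench questions (assumed path)
    for line in f:
        question = json.loads(line)
        temperature = temperature_config[question['category']]
        # Convert the question turns into the dialogue structure shown above,
        # leaving the assistant slots empty for the evaluated model to fill.
        dialogue = []
        for turn in question['turns']:
            dialogue.append({'role': 'user', 'content': turn})
            dialogue.append({'role': 'assistant', 'content': ''})
        subsets[temperature].append({
            'dialogue': dialogue,
            'capability': question['category'],
        })

for temperature, items in subsets.items():
    # File names follow the subset naming expected by the config, e.g. mtbench_0.7.json.
    with open(f'mtbench_{temperature}.json', 'w') as f:
        json.dump(items, f, ensure_ascii=False, indent=2)
```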
Consider citing the following papers if you find them helpful:
```bibtex
@misc{zheng2023judging,
    title={Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena},
    author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
    year={2023},
    eprint={2306.05685},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}
```
## Practice: AlignBench Evaluation
### Dataset


@ -202,6 +202,54 @@ Opencompass 已经支持了很多的JudgeLLM实际上你可以将Opencompa
}
```
## Subjective Multi-round Dialogue Evaluation
In OpenCompass, we also support subjective multi-turn dialogue evaluation. For example, the MT-Bench evaluation is configured in `configs/eval_subjective_mtbench.py`.
For multi-turn dialogue evaluation, you need to organize the data into the following dialogue format:
```
"dialogue": [
{
"role": "user",
"content": "Imagine you are participating in a race with a group of people. If you have just overtaken the second person, what's your current position? Where is the person you just overtook?"
},
{
"role": "assistant",
"content": ""
},
{
"role": "user",
"content": "If the \"second person\" is changed to \"last person\" in the above question, what would the answer be?"
},
{
"role": "assistant",
"content": ""
}
],
```
Note that the different question types in MT-Bench use different temperature settings, so the original data file is split into three subsets by temperature, and each subset is inferred separately with its own temperature. For the specific settings, please refer to `configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`.
If you use this method, please add the following citations:
```bibtex
@misc{zheng2023judging,
    title={Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena},
    author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
    year={2023},
    eprint={2306.05685},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}
```
## Practice: AlignBench Subjective Evaluation
### Dataset Preparation


@ -172,6 +172,8 @@ class ChatInferencer(BaseInferencer):
                 output_json_filepath: Optional[str] = './icl_inference_output',
                 output_json_filename: Optional[str] = 'predictions',
                 save_every: Optional[int] = 1,
                 temperature: Optional[float] = 0.0,
                 do_sample: Optional[bool] = False,
                 infer_mode: str = 'last',
                 **kwargs) -> None:
        super().__init__(
@ -182,6 +184,8 @@ class ChatInferencer(BaseInferencer):
        )
        assert infer_mode in ['last', 'every', 'every_with_gt']
        self.infer_mode = infer_mode
        self.temperature = temperature
        self.do_sample = do_sample
        self.model: BaseModel
        self._set_meta_template(self.model)
@ -347,7 +351,15 @@ class ChatInferencer(BaseInferencer):
        for i in assistant_indices:
            history = chat[:i]
            if self.do_sample:
                output = self.model.generate_from_template(
                    [history],
                    do_sample=self.do_sample,
                    temperature=self.temperature,
                    max_out_len=512)[0]
            else:
                output = self.model.generate_from_template([history],
                                                           do_sample=False,
                                                           max_out_len=512)[0]
            chat[i]['content'] = output

        if not self.dialogue_mode:


@ -127,11 +127,15 @@ class MTBenchSummarizer(CompassArenaSummarizer):
                fout = osp.join(
                    output_dir,
                    'judged-by--' + judge_model + '-capability.csv')
                overall_judged_answers, overall_references = [], []
                for dataset in dataset_cfgs:
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset, subdir_path, self.judge_function)
                    get_capability_results(judged_answers, references,
                                           fout, fout_flag, model)
                    overall_judged_answers += judged_answers
                    overall_references += references
                get_capability_results(overall_judged_answers,
                                       overall_references, fout, fout_flag,
                                       model)
                fout_flag += 1
            else:
                print(subdir_path + ' is not exist! please check!')
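The effect of this change is that capability scores are computed once over the union of the three temperature subsets (`mtbench_0.0`, `mtbench_0.1`, `mtbench_0.7`) instead of once per subset. As a hedged illustration of that kind of aggregation, assuming each judged answer carries a numeric `score` and each reference a `capability` label (this is not the actual `get_capability_results` implementation):

```python
from collections import defaultdict

def average_by_capability(judged_answers, references):
    """Illustrative only: average judge scores per capability across all subsets."""
    scores = defaultdict(list)
    for answer, reference in zip(judged_answers, references):
        scores[reference['capability']].append(answer['score'])
    return {cap: sum(vals) / len(vals) for cap, vals in scores.items()}
```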