[Fix] minor update wildbench (#1335)

* update crb

* update crbbench

* update crbbench

* update crbbench

* minor update wildbench

* [Fix] Update doc of wildbench, and merge wildbench into subjective

* [Fix] Update doc of wildbench, and merge wildbench into subjective, fix crbbench

* Update crb.md

* Update crb_pair_judge.py

* Update crb_single_judge.py

* Update subjective_evaluation.md

* Update openai_api.py

* [Update] update wildbench readme

* [Update] update wildbench readme

* [Update] update wildbench readme, remove crb

* Delete configs/eval_subjective_wildbench_pair.py

* Delete configs/eval_subjective_wildbench_single.py

* Update __init__.py

---------

Co-authored-by: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
klein 2024-07-26 11:19:04 +08:00 committed by GitHub
parent 51a94aee01
commit 65fad8e2ac
9 changed files with 43 additions and 330 deletions

View File

@@ -19,12 +19,16 @@ wildbench
The wildbench.jsonl file is the preprocessed dataset, and the other three files are the reference predictions used for scoring.
Once you download the dataset, you have to modify the path defined in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` and `configs/datasets/subjective/wildbench/wildbench_single_judge.py`
Once you download the dataset, you have to modify the path defined in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` and `configs/datasets/subjective/wildbench/wildbench_single_judge.py`.
Note that you also have to modify the `given_pred` entries in lines 57-61 of `configs/datasets/subjective/wildbench/wildbench_pair_judge.py`; a sketch of these entries is shown below.
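For reference, a minimal sketch of those `given_pred` entries, using the default paths from the download above (adjust them to wherever you placed the reference predictions):

```python
# Sketch of the given_pred setting in wildbench_pair_judge.py; each entry maps a
# baseline model abbreviation to the directory holding its pre-generated predictions.
given_pred = [
    {'abbr': 'gpt4-turbo', 'path': './data/WildBench/gpt4'},
    {'abbr': 'llama-2-70b-chat-hf', 'path': './data/WildBench/llama2-70b'},
    {'abbr': 'HaiKu', 'path': './data/WildBench/claude'},
    {'abbr': 'llama-2-70b-chat-turbomind', 'path': './data/WildBench/llama2-70b'},
    {'abbr': 'llama-2-70b-chat-vllm', 'path': './data/WildBench/llama2-70b'},
]
```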
## Run
We have provided scripts for WildBench in `configs/eval_subjective_wildbench_pair.py` and `configs/eval_subjective_wildbench_single.py`.
Please modify the `given_pred` path (line 171) in `configs/eval_subjective_wildbench_pair.py` to your own path.
Note that if you evaluate WildBench with other models, please set `max_out_len` to 4096, as in the sketch below.
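For example, a model entry with `max_out_len=4096` might look like the following sketch (the model shown is only an illustration):

```python
from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='llama-3-8b-instruct-hf',               # illustrative model
        path='meta-llama/Meta-Llama-3-8B-Instruct',
        max_out_len=4096,                            # recommended for WildBench
        batch_size=8,
        run_cfg=dict(num_gpus=1),
        stop_words=['<|end_of_text|>', '<|eot_id|>'],
    ),
]
```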
## Acknowledgement
We greatly appreciate the authors of the [WildBench dataset](https://github.com/allenai/WildBench). If you find it useful in your research, please consider citing them.

View File

@@ -13,7 +13,7 @@ subjective_reader_cfg = dict(
data_path ='./data/WildBench/wildbench.jsonl'
subjective_datasets = []
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
@@ -33,14 +33,33 @@ subjective_eval_cfg = dict(
),
pred_role='BOT',
)
gpt4 = dict(
abbr='gpt4-turbo',
)
subjective_datasets.append(
claude = dict(
abbr='HaiKu',
)
llama_2_70b = dict(
abbr='llama-2-70b-chat-hf',
)
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
mode='pair',
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}],
mode='m2n', # m models are matched against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude]
))

View File

@@ -12,7 +12,7 @@ subjective_reader_cfg = dict(
data_path ='./data/WildBench/wildbench.jsonl'
subjective_datasets = []
wildbench_single_datasets = []
# the question is a list; how should it be processed?
subjective_infer_cfg = dict(
@@ -35,12 +35,12 @@ subjective_eval_cfg = dict(
pred_role='BOT',
)
subjective_datasets.append(
wildbench_single_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
mode='single',
eval_mode='single',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg

View File

@@ -7,6 +7,7 @@ with read_base():
from .datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
from .datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
from .datasets.subjective.fofo.fofo_judge import fofo_datasets
from .datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
@@ -55,7 +56,9 @@ models = [
)
]
datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets] # add datasets you want
datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets, *wildbench_datasets] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),

View File

@@ -1,180 +0,0 @@
from mmengine.config import read_base
with read_base():
# from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
from .datasets.subjective.wildbench.wildbench_pair_judge import subjective_datasets
from .models.openai.gpt_4 import models as gpt4_models
from .models.hf_llama.hf_llama2_70b_chat import models as llama2_models
# from .models.gemma.hf_gemma_2b_it import models
# from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_model
# # from .models.hf_internlm.hf_internlm2_chat_7b import models
# from .models.yi.hf_yi_1_5_34b_chat import models as yi_model
# from .models.qwen.hf_qwen1_5_72b_chat import models as qwen_model
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import WildBenchPairSummarizer
from opencompass.models.claude_api.claude_api import Claude
from opencompass.models import HuggingFacewithChatTemplate
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
api_meta_template = dict(
round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# _meta_template = dict(
# round=[
# dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
# dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
# ],
# )
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable do_sample for models
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='llama-3-8b-instruct-hf',
path='meta-llama/Meta-Llama-3-8B-Instruct',
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1),
stop_words=['<|end_of_text|>', '<|eot_id|>'],
),
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-6b-chat-hf',
path='01-ai/Yi-1.5-6B-Chat',
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1),
),
dict(
type=HuggingFacewithChatTemplate,
abbr='qwen1.5-7b-chat-hf',
path='Qwen/Qwen1.5-7B-Chat',
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1),
),
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='llama-3-70b-instruct-hf',
# path='meta-llama/Meta-Llama-3-70B-Instruct',
# max_out_len=4096,
# batch_size=8,
# run_cfg=dict(num_gpus=4),
# stop_words=['<|end_of_text|>', '<|eot_id|>'],
# ),
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='yi-1.5-34b-chat-hf',
# path='01-ai/Yi-1.5-34B-Chat',
# max_out_len=4096,
# batch_size=8,
# run_cfg=dict(num_gpus=2),
# ),
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='qwen1.5-72b-chat-hf',
# path='Qwen/Qwen1.5-72B-Chat',
# max_out_len=4096,
# batch_size=8,
# run_cfg=dict(num_gpus=8),
# )
]
datasets = [*subjective_datasets]
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)]
gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-0409-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
) # Re-infer gpt4's predictions, or use the pre-committed gpt4 predictions
claude = dict(abbr='HaiKu',
type=Claude,
path='claude-2',
key='YOUR_CLAUDE_KEY',
query_per_second=1,
max_out_len=2048, max_seq_len=2048, batch_size=2,
)
## single evaluation
# eval = dict(
# partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models),
# runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
# )
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000, strategy='split'),
runner=dict(
type=SlurmSequentialRunner,
max_num_workers=64,
quotatype='reserved',
partition='llmeval',
task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
mode='m2n', # m models are matched against n models
infer_order='random',
# In m2n mode, base_models and compare_models must be specified; pairwise matchups between base_models and compare_models are generated, deduplicated, and no model is compared with itself
base_models = [*llama2_models, gpt4, claude], # baseline models used for comparison
compare_models = models, # models to be evaluated
judge_models=judge_models
),
runner=dict(
type=LocalRunner,
# partition='llmeval',
# quotatype='auto',
max_num_workers=3,
task=dict(
type=SubjectiveEvalTask
)),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}]
)
summarizer = dict(type=WildBenchPairSummarizer)
work_dir = 'outputs/wildbench/'

View File

@@ -1,135 +0,0 @@
from mmengine.config import read_base
with read_base():
# from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
from .datasets.subjective.wildbench.wildbench_single_judge import subjective_datasets
# from .models.gemma.hf_gemma_2b_it import models as gemma_2b_models
# from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_model
# # from .models.hf_internlm.hf_internlm2_chat_7b import models
# from .models.yi.hf_yi_1_5_34b_chat import models as yi_model
# from .models.qwen.hf_qwen1_5_72b_chat import models as qwen_model
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import WildBenchSingleSummarizer
from opencompass.models import HuggingFacewithChatTemplate
# models = sum([v for k, v in locals().items() if k.endswith("_model")], [])
api_meta_template = dict(
round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# _meta_template = dict(
# round=[
# dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
# dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
# ],
# )
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable do_sample for models
# set max_out_len to 4096.
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='llama-3-8b-instruct-hf',
path='meta-llama/Meta-Llama-3-8B-Instruct',
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1),
stop_words=['<|end_of_text|>', '<|eot_id|>'],
),
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-6b-chat-hf',
path='01-ai/Yi-1.5-6B-Chat',
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1),
),
dict(
type=HuggingFacewithChatTemplate,
abbr='qwen1.5-7b-chat-hf',
path='Qwen/Qwen1.5-7B-Chat',
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1),
),
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='llama-3-70b-instruct-hf',
# path='meta-llama/Meta-Llama-3-70B-Instruct',
# max_out_len=4096,
# batch_size=8,
# run_cfg=dict(num_gpus=4),
# stop_words=['<|end_of_text|>', '<|eot_id|>'],
# ),
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='yi-1.5-34b-chat-hf',
# path='01-ai/Yi-1.5-34B-Chat',
# max_out_len=4096,
# batch_size=8,
# run_cfg=dict(num_gpus=2),
# ),
# dict(
# type=HuggingFacewithChatTemplate,
# abbr='qwen1.5-72b-chat-hf',
# path='Qwen/Qwen1.5-72B-Chat',
# max_out_len=4096,
# batch_size=8,
# run_cfg=dict(num_gpus=4),
# )
]
datasets = [*subjective_datasets]
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000, strategy='split'),
runner=dict(
type=SlurmSequentialRunner,
max_num_workers=64,
quotatype='reserved',
partition='llmeval',
task=dict(type=OpenICLInferTask)),
)
## single evaluation
eval = dict(
partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models),
runner=dict(type=LocalRunner,
max_num_workers=2,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=WildBenchSingleSummarizer)
work_dir = 'outputs/wildbench/'

View File

@@ -21,6 +21,7 @@ We support the use of GPT-4 (or other JudgeLLM) for the subjective evaluation of
4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval)
5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main)
6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/)
7. Wildbench English Score and Compare Dataset (https://github.com/allenai/WildBench)
## Initiating Subjective Evaluation

View File

@@ -21,6 +21,7 @@
4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval)
5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main)
6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/)
7. Wildbench English Score and Compare Dataset (https://github.com/allenai/WildBench)
## Initiating Subjective Evaluation

View File

@@ -209,7 +209,7 @@ def parse_conversation(conversation):
@LOAD_DATASET.register_module()
class WildBenchDataset(BaseDataset):
def load(self, path: str, K=-1, mode='pair'):
def load(self, path: str, K=-1, eval_mode='pair', *args, **kwargs):
dataset = DatasetDict()
raw_data = []
with open(path, 'r', encoding='utf-8') as file:
@@ -222,13 +222,13 @@ class WildBenchDataset(BaseDataset):
for checklist_item in item['checklist']:
checklist_mardkdown += f'- {checklist_item}\n'
if mode == 'single':
if eval_mode == 'single':
prompt = score_prompt
elif mode == 'pair':
elif eval_mode == 'pair':
prompt = pair_prompt
else:
raise NotImplementedError(
f'Mode {mode} not in single or pair.')
f'Eval mode {eval_mode} not in single or pair.')
prompt = prompt.replace('{history}', history)
prompt = prompt.replace('{user_query}', last_query)
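To make the effect of the `mode` to `eval_mode` rename easier to follow, here is a minimal sketch of the resulting template-selection branch (the helper name is hypothetical; `score_prompt` and `pair_prompt` stand for the templates defined earlier in `wildbench.py`):

```python
def _select_prompt(eval_mode: str, score_prompt: str, pair_prompt: str) -> str:
    """Return the judge template for the given evaluation mode (sketch only)."""
    if eval_mode == 'single':
        return score_prompt   # single-model scoring template
    if eval_mode == 'pair':
        return pair_prompt    # pairwise comparison template
    raise NotImplementedError(f'Eval mode {eval_mode} not in single or pair.')
```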