From 65fad8e2ac7fae458de8c06fa19458bfd8b61efe Mon Sep 17 00:00:00 2001 From: klein Date: Fri, 26 Jul 2024 11:19:04 +0800 Subject: [PATCH] [Fix] minor update wildbench (#1335) * update crb * update crbbench * update crbbench * update crbbench * minor update wildbench * [Fix] Update doc of wildbench, and merge wildbench into subjective * [Fix] Update doc of wildbench, and merge wildbench into subjective, fix crbbench * Update crb.md * Update crb_pair_judge.py * Update crb_single_judge.py * Update subjective_evaluation.md * Update openai_api.py * [Update] update wildbench readme * [Update] update wildbench readme * [Update] update wildbench readme, remove crb * Delete configs/eval_subjective_wildbench_pair.py * Delete configs/eval_subjective_wildbench_single.py * Update __init__.py --------- Co-authored-by: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> --- .../subjective/wildbench/wildbench.md | 10 +- .../wildbench/wildbench_pair_judge.py | 27 ++- .../wildbench/wildbench_single_judge.py | 6 +- configs/eval_subjective.py | 5 +- configs/eval_subjective_wildbench_pair.py | 180 ------------------ configs/eval_subjective_wildbench_single.py | 135 ------------- .../advanced_guides/subjective_evaluation.md | 1 + .../advanced_guides/subjective_evaluation.md | 1 + opencompass/datasets/subjective/wildbench.py | 8 +- 9 files changed, 43 insertions(+), 330 deletions(-) delete mode 100644 configs/eval_subjective_wildbench_pair.py delete mode 100644 configs/eval_subjective_wildbench_single.py diff --git a/configs/datasets/subjective/wildbench/wildbench.md b/configs/datasets/subjective/wildbench/wildbench.md index e4567ba1..c6101233 100644 --- a/configs/datasets/subjective/wildbench/wildbench.md +++ b/configs/datasets/subjective/wildbench/wildbench.md @@ -19,12 +19,16 @@ wildbench The wildbench.jsonl file is the preprocessed dataset, and the other three are the references used for scoring. -Once you download the dataset, you have to modify the path defined in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` and `configs/datasets/subjective/wildbench/wildbench_single_judge.py` +Once you have downloaded the dataset, modify the data path defined in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` and `configs/datasets/subjective/wildbench/wildbench_single_judge.py`. + +Note that you also have to modify the `given_pred` paths in lines 57-61 of `configs/datasets/subjective/wildbench/wildbench_pair_judge.py`. ## Run Wildbench is evaluated through `configs/eval_subjective.py`: import `wildbench_datasets` from `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` (or `wildbench_single_datasets` from `wildbench_single_judge.py`) and add it to the `datasets` list. -Please modify the path for `give_pred` (line 171) in `configs/eval_subjective_wildbench_pair.py` to your path. - Note that if you evaluate wildbench with other models, please set `max_out_len` to 4096. + +## Acknowledgement + +We greatly appreciate the authors of the [wildbench dataset](https://github.com/allenai/WildBench). If you find it useful in your research, please consider citing them. 
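For reference, here is a minimal sketch (not part of the patch itself) of the user-editable fields in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` that the readme above asks you to adjust. The paths shown are the defaults from this patch; point them at your local copy of the data. The fields are shown as plain variables for readability, whereas in the config `given_pred` is a key of the dataset dict.

```python
# Sketch of the fields to edit in wildbench_pair_judge.py (paths are the
# defaults shipped with this patch; replace them with your local locations).
data_path = './data/WildBench/wildbench.jsonl'  # preprocessed dataset

# Pre-computed reference predictions for the baseline models, so the baselines
# do not need to be re-inferred during the pairwise evaluation.
given_pred = [
    {'abbr': 'gpt4-turbo', 'path': './data/WildBench/gpt4'},
    {'abbr': 'llama-2-70b-chat-hf', 'path': './data/WildBench/llama2-70b'},
    {'abbr': 'HaiKu', 'path': './data/WildBench/claude'},
]
```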
diff --git a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py index e0a34c70..5037ae45 100644 --- a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py +++ b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py @@ -13,7 +13,7 @@ subjective_reader_cfg = dict( data_path ='./data/WildBench/wildbench.jsonl' -subjective_datasets = [] +wildbench_datasets = [] subjective_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -33,14 +33,33 @@ subjective_eval_cfg = dict( ), pred_role='BOT', ) +gpt4 = dict( + abbr='gpt4-turbo', +) -subjective_datasets.append( +claude = dict( + abbr='HaiKu', +) + +llama_2_70b = dict( + abbr='llama-2-70b-chat-hf', +) + +wildbench_datasets.append( dict( abbr='wildbench', type=WildBenchDataset, path=data_path, - mode='pair', + eval_mode='pair', reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, - eval_cfg=subjective_eval_cfg + eval_cfg=subjective_eval_cfg, + given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'}, + {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'}, + {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'}, + {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'}, + {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}], + mode='m2n', # m models are compared against n models + infer_order='random', + base_models = [llama_2_70b, gpt4, claude] )) diff --git a/configs/datasets/subjective/wildbench/wildbench_single_judge.py b/configs/datasets/subjective/wildbench/wildbench_single_judge.py index be11abcb..7c0df0de 100644 --- a/configs/datasets/subjective/wildbench/wildbench_single_judge.py +++ b/configs/datasets/subjective/wildbench/wildbench_single_judge.py @@ -12,7 +12,7 @@ subjective_reader_cfg = dict( data_path ='./data/WildBench/wildbench.jsonl' -subjective_datasets = [] +wildbench_single_datasets = [] # the question is a list, how to process it subjective_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -35,12 +35,12 @@ subjective_eval_cfg = dict( pred_role='BOT', ) -subjective_datasets.append( +wildbench_single_datasets.append( dict( abbr='wildbench', type=WildBenchDataset, path=data_path, - mode='single', + eval_mode='single', reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, eval_cfg=subjective_eval_cfg diff --git a/configs/eval_subjective.py b/configs/eval_subjective.py index 523374d7..13d1228d 100644 --- a/configs/eval_subjective.py +++ b/configs/eval_subjective.py @@ -7,6 +7,7 @@ with read_base(): from .datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets from .datasets.subjective.compassbench.compassbench_compare import compassbench_datasets from .datasets.subjective.fofo.fofo_judge import fofo_datasets + from .datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI @@ -55,7 +56,9 @@ models = [ ) ] -datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets] # add datasets you want + +datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets, 
*wildbench_datasets] # add datasets you want + infer = dict( partitioner=dict(type=NaivePartitioner), diff --git a/configs/eval_subjective_wildbench_pair.py b/configs/eval_subjective_wildbench_pair.py deleted file mode 100644 index 652793cf..00000000 --- a/configs/eval_subjective_wildbench_pair.py +++ /dev/null @@ -1,180 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - # from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets - from .datasets.subjective.wildbench.wildbench_pair_judge import subjective_datasets - from .models.openai.gpt_4 import models as gpt4_models - from .models.hf_llama.hf_llama2_70b_chat import models as llama2_models - # from .models.gemma.hf_gemma_2b_it import models - # from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_model - # # from .models.hf_internlm.hf_internlm2_chat_7b import models - # from .models.yi.hf_yi_1_5_34b_chat import models as yi_model - # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen_model - -from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.partitioners import NaivePartitioner, SizePartitioner -from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner -from opencompass.partitioners.sub_size import SubjectiveSizePartitioner -from opencompass.runners import LocalRunner -from opencompass.runners import SlurmSequentialRunner -from opencompass.tasks import OpenICLInferTask -from opencompass.tasks.subjective_eval import SubjectiveEvalTask -from opencompass.summarizers import WildBenchPairSummarizer -from opencompass.models.claude_api.claude_api import Claude -from opencompass.models import HuggingFacewithChatTemplate - - -models = sum([v for k, v in locals().items() if k.endswith('_model')], []) - -api_meta_template = dict( - round=[ - dict(role='SYSTEM', api_role='SYSTEM'), - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - -# _meta_template = dict( -# round=[ -# dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), -# dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), -# ], -# ) -# -------------Inference Stage ---------------------------------------- -# For subjective evaluation, we often set do sample for models - -models = [ - dict( - type=HuggingFacewithChatTemplate, - abbr='llama-3-8b-instruct-hf', - path='meta-llama/Meta-Llama-3-8B-Instruct', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - stop_words=['<|end_of_text|>', '<|eot_id|>'], - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='yi-1.5-6b-chat-hf', - path='01-ai/Yi-1.5-6B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='qwen1.5-7b-chat-hf', - path='Qwen/Qwen1.5-7B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='llama-3-70b-instruct-hf', - # path='meta-llama/Meta-Llama-3-70B-Instruct', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=4), - # stop_words=['<|end_of_text|>', '<|eot_id|>'], - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='yi-1.5-34b-chat-hf', - # path='01-ai/Yi-1.5-34B-Chat', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=2), - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='qwen1.5-72b-chat-hf', - # path='Qwen/Qwen1.5-72B-Chat', - # max_out_len=4096, - # 
batch_size=8, - # run_cfg=dict(num_gpus=8), - # ) -] - -datasets = [*subjective_datasets] - -# -------------Evalation Stage ---------------------------------------- - -## ------------- JudgeLLM Configuration -judge_models = [dict( - abbr='GPT4-Turbo', - type=OpenAI, - path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613 - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature=0, -)] - -gpt4 = dict( - abbr='gpt4-turbo', - type=OpenAI, - path='gpt-4-0409-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=2048, - max_seq_len=4096, - batch_size=4, - retry=20, - temperature=1, -) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions - -claude = dict(abbr='HaiKu', - type=Claude, - path='claude-2', - key='YOUR_CLAUDE_KEY', - query_per_second=1, - max_out_len=2048, max_seq_len=2048, batch_size=2, - ) -## single evaluation -# eval = dict( -# partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models), -# runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)), -# ) -infer = dict( - partitioner=dict(type=SizePartitioner, max_task_size=1000, strategy='split'), - runner=dict( - type=SlurmSequentialRunner, - max_num_workers=64, - quotatype='reserved', - partition='llmeval', - task=dict(type=OpenICLInferTask)), -) - -eval = dict( - partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='m2n', # m个模型 与 n个模型进行对战 - infer_order='random', - # 在m2n模式下,需要指定base_models和compare_models,将会对base_models和compare_models生成对应的两两pair(去重且不会与自身进行比较) - base_models = [*llama2_models, gpt4, claude], # 用于对比的基线模型 - compare_models = models, # 待评测模型 - judge_models=judge_models - ), - runner=dict( - type=LocalRunner, - # partition='llmeval', - # quotatype='auto', - max_num_workers=3, - task=dict( - type=SubjectiveEvalTask - )), - given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/WildBench/gpt4'}, - {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'}, - {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}] -) - -summarizer = dict(type=WildBenchPairSummarizer) - -work_dir = 'outputs/wildbench/' diff --git a/configs/eval_subjective_wildbench_single.py b/configs/eval_subjective_wildbench_single.py deleted file mode 100644 index 5e053488..00000000 --- a/configs/eval_subjective_wildbench_single.py +++ /dev/null @@ -1,135 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - # from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets - from .datasets.subjective.wildbench.wildbench_single_judge import subjective_datasets - # from .models.gemma.hf_gemma_2b_it import models as gemma_2b_models - # from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_model - # # from .models.hf_internlm.hf_internlm2_chat_7b import models - # from .models.yi.hf_yi_1_5_34b_chat import models as yi_model - # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen_model - -from opencompass.models import 
HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.partitioners import NaivePartitioner, SizePartitioner -from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner -from opencompass.partitioners.sub_size import SubjectiveSizePartitioner -from opencompass.runners import LocalRunner -from opencompass.runners import SlurmSequentialRunner -from opencompass.tasks import OpenICLInferTask -from opencompass.tasks.subjective_eval import SubjectiveEvalTask -from opencompass.summarizers import WildBenchSingleSummarizer -from opencompass.models import HuggingFacewithChatTemplate - - -# models = sum([v for k, v in locals().items() if k.endswith("_model")], []) - -api_meta_template = dict( - round=[ - dict(role='SYSTEM', api_role='SYSTEM'), - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - -# _meta_template = dict( -# round=[ -# dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), -# dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), -# ], -# ) -# -------------Inference Stage ---------------------------------------- -# For subjective evaluation, we often set do sample for models -# set max_out_len to 4096. -models = [ - dict( - type=HuggingFacewithChatTemplate, - abbr='llama-3-8b-instruct-hf', - path='meta-llama/Meta-Llama-3-8B-Instruct', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - stop_words=['<|end_of_text|>', '<|eot_id|>'], - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='yi-1.5-6b-chat-hf', - path='01-ai/Yi-1.5-6B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='qwen1.5-7b-chat-hf', - path='Qwen/Qwen1.5-7B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='llama-3-70b-instruct-hf', - # path='meta-llama/Meta-Llama-3-70B-Instruct', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=4), - # stop_words=['<|end_of_text|>', '<|eot_id|>'], - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='yi-1.5-34b-chat-hf', - # path='01-ai/Yi-1.5-34B-Chat', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=2), - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='qwen1.5-72b-chat-hf', - # path='Qwen/Qwen1.5-72B-Chat', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=4), - # ) -] - -datasets = [*subjective_datasets] - -# -------------Evalation Stage ---------------------------------------- - -## ------------- JudgeLLM Configuration -judge_models = [dict( - abbr='GPT4-Turbo', - type=OpenAI, - path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613 - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature=0, -)] - - -infer = dict( - partitioner=dict(type=SizePartitioner, max_task_size=1000, strategy='split'), - runner=dict( - type=SlurmSequentialRunner, - max_num_workers=64, - quotatype='reserved', - partition='llmeval', - task=dict(type=OpenICLInferTask)), -) - -## single evaluation -eval = dict( - partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models), - runner=dict(type=LocalRunner, - max_num_workers=2, - 
task=dict(type=SubjectiveEvalTask)), -) - -summarizer = dict(type=WildBenchSingleSummarizer) - -work_dir = 'outputs/wildbench/' diff --git a/docs/en/advanced_guides/subjective_evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md index 50a41114..24a55921 100644 --- a/docs/en/advanced_guides/subjective_evaluation.md +++ b/docs/en/advanced_guides/subjective_evaluation.md @@ -21,6 +21,7 @@ We support the use of GPT-4 (or other JudgeLLM) for the subjective evaluation of 4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval) 5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main) 6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/) +7. Wildbench English Score and Compare Dataset (https://github.com/allenai/WildBench) ## Initiating Subjective Evaluation diff --git a/docs/zh_cn/advanced_guides/subjective_evaluation.md b/docs/zh_cn/advanced_guides/subjective_evaluation.md index d5b40c06..2dfb509e 100644 --- a/docs/zh_cn/advanced_guides/subjective_evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md @@ -21,6 +21,7 @@ 4. AlpacaEvalv2 英文Compare数据集(https://github.com/tatsu-lab/alpaca_eval) 5. ArenaHard 英文Compare数据集,主要面向coding(https://github.com/lm-sys/arena-hard/tree/main) 6. Fofo 英文Socring数据集(https://github.com/SalesforceAIResearch/FoFo/) +7. Wildbench 英文Score和Compare数据集(https://github.com/allenai/WildBench) ## 启动主观评测 diff --git a/opencompass/datasets/subjective/wildbench.py b/opencompass/datasets/subjective/wildbench.py index 8f0995f5..65d8ec27 100644 --- a/opencompass/datasets/subjective/wildbench.py +++ b/opencompass/datasets/subjective/wildbench.py @@ -209,7 +209,7 @@ def parse_conversation(conversation): @LOAD_DATASET.register_module() class WildBenchDataset(BaseDataset): - def load(self, path: str, K=-1, mode='pair'): + def load(self, path: str, K=-1, eval_mode='pair', *args, **kwargs): dataset = DatasetDict() raw_data = [] with open(path, 'r', encoding='utf-8') as file: @@ -222,13 +222,13 @@ class WildBenchDataset(BaseDataset): for checklist_item in item['checklist']: checklist_mardkdown += f'- {checklist_item}\n' - if mode == 'single': + if eval_mode == 'single': prompt = score_prompt - elif mode == 'pair': + elif eval_mode == 'pair': prompt = pair_prompt else: assert NotImplementedError( - f'Mode {mode} not in single or pair.') + f'Eval mode {eval_mode} not in single or pair.') prompt = prompt.replace('{history}', history) prompt = prompt.replace('{user_query}', last_query)
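As an illustration of the renamed `eval_mode` argument above, here is a minimal sketch (not the repository code) of how the mode selects the judge prompt: 'single' picks the single-model scoring prompt and 'pair' the pairwise comparison prompt. The sketch raises directly, whereas the patched loader keeps its `assert NotImplementedError(...)` fallback, which never fires because the exception instance is always truthy.

```python
# Illustrative sketch of the eval_mode dispatch introduced by this patch:
# 'single' -> scoring prompt, 'pair' -> comparison prompt, anything else fails loudly.
def select_prompt(eval_mode: str, score_prompt: str, pair_prompt: str) -> str:
    if eval_mode == 'single':
        return score_prompt   # single-model scoring
    if eval_mode == 'pair':
        return pair_prompt    # pairwise comparison against a baseline
    raise NotImplementedError(f'Eval mode {eval_mode} not in single or pair.')
```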