diff --git a/examples/eval_OpenHuEval_HuLifeQA.py b/examples/eval_OpenHuEval_HuLifeQA.py index f4ed6d62..2cfe616e 100644 --- a/examples/eval_OpenHuEval_HuLifeQA.py +++ b/examples/eval_OpenHuEval_HuLifeQA.py @@ -17,11 +17,10 @@ with read_base():     from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model -    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model     from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model     from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model -    from opencompass.configs.models.deepseek.deepseek_r1_distill_llama_8b_api_aliyun import models as deepseek_r1_distill_llama_8b_api_aliyun_model -    from opencompass.configs.models.deepseek.deepseek_r1_distill_qwen_7b_api_aliyun import models as deepseek_r1_distill_qwen_7b_api_aliyun_model +    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model +    # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model  from opencompass.models import OpenAI from opencompass.partitioners import ( @@ -46,12 +45,6 @@ for model in deepseek_r1_api_aliyun_model:             'type': 'rm_<think>_before_eval'         }     } -    if model['abbr'].startswith('QwQ'): -        model['pred_postprocessor'] = { -            'OpenHuEval_*': { -                'type': 'extract_qwq_answer_before_eval' -            } -        } del model models = [ @@ -60,8 +53,6 @@ models = [     *o1_mini_2024_09_12_model,     *deepseek_v3_api_aliyun_model,     *deepseek_r1_api_aliyun_model, -    *deepseek_r1_distill_llama_8b_api_aliyun_model, -    *deepseek_r1_distill_qwen_7b_api_aliyun_model,     *lmdeploy_qwen2_5_7b_instruct_model,     *lmdeploy_qwen2_5_72b_instruct_model,     *lmdeploy_llama3_1_8b_instruct_model, diff --git a/examples/eval_subjective_wildbench_single.py b/examples/eval_subjective_wildbench_single.py new file mode 100644 index 
00000000..fc8dbd0a --- /dev/null +++ b/examples/eval_subjective_wildbench_single.py @@ -0,0 +1,112 @@ +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.subjective.wildbench.wildbench_single_judge import wildbench_datasets + + from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model + from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_20241120_model + from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model + + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import models as lmdeploy_llama3_1_8b_instruct_model + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import models as lmdeploy_llama3_1_70b_instruct_model + + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model + + from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model + from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model + from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model + # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model + +from opencompass.models import OpenAI +from opencompass.partitioners import ( + NumWorkerPartitioner, + SubjectiveNumWorkerPartitioner, +) +from opencompass.runners import LocalRunner, SlurmSequentialRunner +from opencompass.summarizers import WildBenchSingleSummarizer +from opencompass.tasks import OpenICLInferTask +from 
opencompass.tasks.subjective_eval import SubjectiveEvalTask + +api_meta_template = dict(round=[ + dict(role='SYSTEM', api_role='SYSTEM'), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + +for model in deepseek_r1_api_aliyun_model: + model['return_reasoning_content'] = True + model['pred_postprocessor'] = { + 'wildbench*': { + 'type': 'rm_<think>_before_eval' + } + } +del model + +models = [ + *gpt_4o_mini_20240718_model, + *gpt_4o_20241120_model, + *o1_mini_2024_09_12_model, + *deepseek_v3_api_aliyun_model, + *deepseek_r1_api_aliyun_model, + *lmdeploy_qwen2_5_7b_instruct_model, + *lmdeploy_qwen2_5_72b_instruct_model, + *lmdeploy_llama3_1_8b_instruct_model, + *lmdeploy_llama3_1_70b_instruct_model, + # *lmdeploy_internlm3_8b_instruct_model, + *lmdeploy_qwq_32b_preview_model, +] + +judge_models = [ + dict( + abbr='GPT-4o-2024-08-06', + type=OpenAI, + path='gpt-4o-2024-08-06', + key='ENV', + openai_proxy_url='ENV', + verbose=True, + meta_template=api_meta_template, + query_per_second=2, + max_out_len=8192, + max_seq_len=16384, + batch_size=8, + temperature=0, + ) +] + +for ds in wildbench_datasets: + ds.update(dict(mode='singlescore')) +del ds +datasets = [*wildbench_datasets] +del wildbench_datasets + +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8, + ), + runner=dict( + type=SlurmSequentialRunner, + max_num_workers=16, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict( + type=SubjectiveNumWorkerPartitioner, + num_worker=8, + models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) + +summarizer = dict(type=WildBenchSingleSummarizer) + +work_dir = ( + './outputs/' + __file__.split('/')[-1].split('.')[0] + '/' +) # do NOT modify this line, yapf: disable, pylint: disable diff --git a/opencompass/configs/datasets/subjective/wildbench/wildbench_single_judge.py 
b/opencompass/configs/datasets/subjective/wildbench/wildbench_single_judge.py new file mode 100644 index 00000000..13b45c62 --- /dev/null +++ b/opencompass/configs/datasets/subjective/wildbench/wildbench_single_judge.py @@ -0,0 +1,50 @@ +from opencompass.datasets import WildBenchDataset +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.openicl.icl_inferencer import ChatInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +wildbench_reader_cfg = dict( + input_columns=['dialogue', 'prompt'], + output_column='judge', +) + +data_path = '/mnt/hwfile/opendatalab/yanghaote/datasets/WildBench/wildbench.jsonl' + +wildbench_datasets = [] +wildbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template="""{dialogue}""" + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=ChatInferencer, + max_seq_len=32768, + max_out_len=8192, + infer_mode='last', + ), +) + +wildbench_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template="""{prompt}""" + ), + ), + pred_role='BOT', +) + +wildbench_datasets.append( + dict( + abbr='wildbench', + type=WildBenchDataset, + path=data_path, + eval_mode='single', + reader_cfg=wildbench_reader_cfg, + infer_cfg=wildbench_infer_cfg, + eval_cfg=wildbench_eval_cfg, + ) +) diff --git a/opencompass/configs/models/openai/gpt_4o_2024_11_20.py b/opencompass/configs/models/openai/gpt_4o_2024_11_20.py index 7ceb00e5..f0801c29 100644 --- a/opencompass/configs/models/openai/gpt_4o_2024_11_20.py +++ b/opencompass/configs/models/openai/gpt_4o_2024_11_20.py @@ -15,7 +15,7 @@ models = [         openai_proxy_url='ENV',         meta_template=api_meta_template,         query_per_second=1, -        max_out_len=2048, -        max_seq_len=4096, +        max_out_len=8192, +        max_seq_len=8192,         batch_size=8), ] diff --git a/opencompass/configs/models/openai/gpt_4o_mini_20240718.py 
b/opencompass/configs/models/openai/gpt_4o_mini_20240718.py index 8073bfbc..fcb38235 100644 --- a/opencompass/configs/models/openai/gpt_4o_mini_20240718.py +++ b/opencompass/configs/models/openai/gpt_4o_mini_20240718.py @@ -15,7 +15,7 @@ models = [ openai_proxy_url='ENV', meta_template=api_meta_template, query_per_second=1, - max_out_len=2048, - max_seq_len=4096, + max_out_len=8192, + max_seq_len=8192, batch_size=8), ]