diff --git a/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/configs/datasets/subjective/compassarena/compassarena_compare_new.py
index 96d7ac65..a32691ad 100644
--- a/configs/datasets/subjective/compassarena/compassarena_compare_new.py
+++ b/configs/datasets/subjective/compassarena/compassarena_compare_new.py
@@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
     )
 
     subjective_eval_cfg = dict(
diff --git a/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py b/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
index 8eb9d901..eff6cbb0 100644
--- a/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
+++ b/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
 )
 
 subjective_eval_cfg = dict(
diff --git a/configs/eval_academic_leaderboard_202412.py b/configs/eval_academic_leaderboard_202412.py
new file mode 100644
index 00000000..0a9e19a5
--- /dev/null
+++ b/configs/eval_academic_leaderboard_202412.py
@@ -0,0 +1,152 @@
+from mmengine.config import read_base
+import os.path as osp
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner, VOLCRunner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+
+#######################################################################
+#                      PART 0  Essential Configs                      #
+#######################################################################
+with read_base():
+    # Datasets Part
+    ## Core Set
+    # Knowledge
+    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
+        mmlu_pro_datasets,
+    )
+
+    # General Reasoning
+    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
+        gpqa_datasets,
+    )
+    from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
+        bbh_datasets,
+    )
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
+        humaneval_datasets,
+    )
+
+    # Instruction Following
+    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import (
+        ifeval_datasets,
+    )
+    from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import (
+        LCBCodeGeneration_dataset,
+    )
+
+    # Math
+    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
+        cmo_fib_datasets,
+    )
+    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
+        aime2024_datasets,
+    )
+    from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import (
+        math_datasets,
+    )
+
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
+    from opencompass.configs.summarizers.groups.mmlu_pro import (
+        mmlu_pro_summary_groups,
+    )
+
+    # Model List
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
+        models as hf_internlm2_5_7b_chat_model,
+    )
+
+#######################################################################
+#                       PART 1  Datasets List                         #
+#######################################################################
+# datasets list for evaluation
+# Only take LCB generation for evaluation
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')), []
+) + [LCBCodeGeneration_dataset]
+
+#######################################################################
+#                     PART 2  Dataset Summarizer                      #
+#######################################################################
+
+core_summary_groups = [
+    {
+        'name': 'core_average',
+        'subsets': [
+            ['IFEval', 'Prompt-level-strict-accuracy'],
+            ['bbh', 'naive_average'],
+            ['math_prm800k_500', 'accuracy'],
+            ['cmo_fib', 'accuracy'],
+            ['aime2024', 'accuracy'],
+            ['GPQA_diamond', 'accuracy'],
+            ['mmlu_pro', 'naive_average'],
+            ['openai_humaneval', 'humaneval_pass@1'],
+            ['lcb_code_generation', 'pass@1'],
+        ],
+    },
+]
+
+
+summarizer = dict(
+    dataset_abbrs=[
+        ['core_average', 'naive_average'],
+        '',
+        'Instruction Following',
+        ['IFEval', 'Prompt-level-strict-accuracy'],
+        '',
+        'General Reasoning',
+        ['bbh', 'naive_average'],
+        ['GPQA_diamond', 'accuracy'],
+        '',
+        'Math Calculation',
+        ['math_prm800k_500', 'accuracy'],
+        ['cmo_fib', 'accuracy'],
+        ['aime2024', 'accuracy'],
+        '',
+        'Knowledge',
+        ['mmlu_pro', 'naive_average'],
+        '',
+        'Code',
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['lcb_code_generation', 'pass@1'],
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+    ),
+)
+
+#######################################################################
+#                        PART 3  Models List                          #
+#######################################################################
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+#######################################################################
+#              PART 4  Inference/Evaluation Configuration             #
+#######################################################################
+
+# Local Runner
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        retry=0,  # Modify if needed
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(
+        type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)
+    ),
+)
+
+
+#######################################################################
+#                     PART 5  Utils Configuration                     #
+#######################################################################
+work_dir = './outputs/oc_academic_202412'
diff --git a/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_925fc4.py b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_925fc4.py
new file mode 100644
index 00000000..4370395e
--- /dev/null
+++ b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_925fc4.py
@@ -0,0 +1,96 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+bbh_multiple_choice_sets = [
+    'temporal_sequences',
+    'disambiguation_qa',
+    'date_understanding',
+    'tracking_shuffled_objects_three_objects',
+    'penguins_in_a_table',
+    'geometric_shapes',
+    'snarks',
+    'ruin_names',
+    'tracking_shuffled_objects_seven_objects',
+    'tracking_shuffled_objects_five_objects',
+    'logical_deduction_three_objects',
+    'hyperbaton',
+    'logical_deduction_five_objects',
+    'logical_deduction_seven_objects',
+    'movie_recommendation',
+    'salient_translation_error_detection',
+    'reasoning_about_colored_objects',
+]
+bbh_free_form_sets = [
+    'multistep_arithmetic_two',
+    'navigate',
+    'dyck_languages',
+    'word_sorting',
+    'sports_understanding',
+    'boolean_expressions',
+    'object_counting',
+    'formal_fallacies',
+    'causal_judgement',
+    'web_of_lies',
+]
+
+bbh_datasets = []
+for _name in bbh_multiple_choice_sets:
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(
+        evaluator=dict(type=BBHEvaluator_mcq),
+        pred_role='BOT',
+        pred_postprocessor=dict(type=bbh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
+
+for _name in bbh_free_form_sets:
+
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
diff --git a/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py b/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py
index 6447dfe3..ccb95641 100644
--- a/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py
+++ b/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py
@@ -50,7 +50,7 @@ for category in categories:
         abbr=f"korbench_mixed_{category}",
         path="opencompass/korbench",
         category=category,
-        mode='mixed',
+        prompt_mode='mixed',
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
         eval_cfg=eval_cfg,
diff --git a/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py
index 8a7824b7..57c9350d 100644
--- a/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py
+++ b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py
@@ -50,7 +50,7 @@ for category in categories:
         type=korbenchDataset,
         abbr=f"korbench_{category}",
         path="opencompass/korbench",
-        mode='0_shot',
+        prompt_mode='0_shot',
         category=category,
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
diff --git a/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py b/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py
index dc959189..1bf65b4e 100644
--- a/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py
+++ b/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py
@@ -1,4 +1,7 @@
-from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
+from opencompass.datasets.korbench.korbench import (
+    korbenchDataset,
+    korbenchEvaluator,
+)
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -13,19 +16,9 @@ for category in categories:
     prompt_template = dict(
         type=PromptTemplate,
         template=dict(
-            begin=[
-                dict(
-                    role="HUMAN",
-                    prompt=""
-                )
-            ],
-            round=[
-                dict(
-                    role="HUMAN",
-                    prompt="{prompt}"  # f-string
-                )
-            ]
-        )
+            begin=[dict(role="HUMAN", prompt="")],
+            round=[dict(role="HUMAN", prompt="{prompt}")],  # f-string
+        ),
     )
 
     # Reader configuration
@@ -51,7 +44,7 @@ for category in categories:
         type=korbenchDataset,
         abbr=f"korbench_{category}",
         path="opencompass/korbench",
-        mode='3_shot',
+        prompt_mode='3_shot',
         category=category,
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py
new file mode 100644
index 00000000..d4690bb3
--- /dev/null
+++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py
@@ -0,0 +1,45 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    MATHDataset,
+    MATHEvaluator,
+    math_postprocess_v2,
+    normalize_final_answer,
+)
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
+                ),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024),
+)
+
+# postprocess v2
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'),
+    pred_postprocessor=dict(type=math_postprocess_v2),
+)
+
+math_datasets = [
+    dict(
+        type=MATHDataset,
+        abbr='math_prm800k_500',
+        path='opencompass/math',
+        file_name='test_prm800k_500.json',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py
index 96d7ac65..a32691ad 100644
--- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py
+++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py
@@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
     )
 
     subjective_eval_cfg = dict(
diff --git a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
index 8eb9d901..eff6cbb0 100644
--- a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
+++ b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
 )
 
 subjective_eval_cfg = dict(
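
Editor's note: the new configs/eval_academic_leaderboard_202412.py builds its datasets list by flattening every `*_datasets` variable pulled in via read_base() and then appending the single LCB code-generation dataset. A minimal, self-contained sketch of that aggregation idiom follows; the stand-in dicts below are illustrative placeholders, not the real imported dataset configs.

# Stand-ins for the lists that read_base() would import (illustrative only).
mmlu_pro_datasets = [dict(abbr='mmlu_pro')]
bbh_datasets = [dict(abbr='bbh-snarks'), dict(abbr='bbh-navigate')]
LCBCodeGeneration_dataset = dict(abbr='lcb_code_generation')

# Same idiom as the config: sum() with an empty-list start flattens every
# `*_datasets` list, and the one dataset defined on its own is appended.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')), []
) + [LCBCodeGeneration_dataset]

print([d['abbr'] for d in datasets])
# ['mmlu_pro', 'bbh-snarks', 'bbh-navigate', 'lcb_code_generation']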