mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

commit 0d26b348e4
parent 54c0fb7a93

[Feature] Add OC academic 2412 (#1750)
@@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
         ]),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+    inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
 )

 subjective_eval_cfg = dict(
@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
             template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
 )

 subjective_eval_cfg = dict(
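Both hunks widen the generation budget for subjective evaluation: the GenInferencer hunk doubles max_out_len from 2048 to 4096, and the ChatInferencer hunk raises max_seq_len from 4096 to 32768 and max_out_len from 512 to 4096. As a rough sketch of what the two bounds mean (assumed semantics and a hypothetical model interface, not OpenCompass's actual code): max_seq_len caps prompt plus output together, while max_out_len caps the generated continuation alone.

def bounded_generate(model, prompt_ids, max_seq_len=32768, max_out_len=4096):
    # Sketch only: 'model.next_token' and 'model.eos' are hypothetical names.
    out = []
    while len(out) < max_out_len and len(prompt_ids) + len(out) < max_seq_len:
        token = model.next_token(prompt_ids + out)
        if token == model.eos:
            break
        out.append(token)
    return out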
configs/eval_academic_leaderboard_202412.py (new file, 152 lines)

from mmengine.config import read_base
import os.path as osp
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask


#######################################################################
#                       PART 0  Essential Configs                     #
#######################################################################
with read_base():
    # Datasets Part
    ## Core Set
    # Knowledge
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
        mmlu_pro_datasets,
    )

    # General Reasoning
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
        gpqa_datasets,
    )
    from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
        bbh_datasets,
    )
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
        humaneval_datasets,
    )

    # Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import (
        ifeval_datasets,
    )
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import (
        LCBCodeGeneration_dataset,
    )

    # Math
    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
        cmo_fib_datasets,
    )
    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
        aime2024_datasets,
    )
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import (
        math_datasets,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import (
        mmlu_pro_summary_groups,
    )

    # Model List
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as hf_internlm2_5_7b_chat_model,
    )

#######################################################################
#                       PART 1  Datasets List                         #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')), []
) + [LCBCodeGeneration_dataset]

#######################################################################
#                       PART 2  Dataset Summarizer                    #
#######################################################################

core_summary_groups = [
    {
        'name': 'core_average',
        'subsets': [
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['bbh', 'naive_average'],
            ['math_prm800k_500', 'accuracy'],
            ['cmo_fib', 'accuracy'],
            ['aime2024', 'accuracy'],
            ['GPQA_diamond', 'accuracy'],
            ['mmlu_pro', 'naive_average'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['lcb_code_generation', 'pass@1'],
        ],
    },
]


summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['cmo_fib', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['lcb_code_generation', 'pass@1'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
    ),
)

#######################################################################
#                       PART 3  Models List                           #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

#######################################################################
#              PART 4  Inference/Evaluation Configuration             #
#######################################################################

# Local Runner
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask),
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)
    ),
)


#######################################################################
#                       PART 5  Utils Configuration                   #
#######################################################################
work_dir = './outputs/oc_academic_202412'
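The entry config relies on the same locals() sweep three times: datasets, summary_groups, and models are all collected by name suffix from whatever read_base() imported into the module namespace. A self-contained sketch of the idiom, with stand-in variables in place of the real imports above:

# Stand-ins for what read_base() would import into the module.
mmlu_pro_datasets = [dict(abbr='mmlu_pro')]
gpqa_datasets = [dict(abbr='GPQA_diamond')]

# At module level, locals() is the module namespace, so this flattens
# every *_datasets list into one list, in definition order.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
assert [d['abbr'] for d in datasets] == ['mmlu_pro', 'GPQA_diamond']

The file is then launched like any other OpenCompass entry config, e.g. python run.py configs/eval_academic_leaderboard_202412.py from the repository root.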
@@ -0,0 +1,96 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:

    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
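Both loops append bbh_infer_cfg.copy() and bbh_eval_cfg.copy() rather than the dicts themselves. A standalone illustration of why the shallow copy matters when one config dict feeds many entries (nested dicts are still shared, which is acceptable here because the loop rebuilds the config on every iteration):

base_cfg = dict(max_out_len=512)
aliased = [base_cfg, base_cfg]               # same object twice
copied = [base_cfg.copy(), base_cfg.copy()]  # independent top-level copies

base_cfg['max_out_len'] = 1024
assert aliased[0]['max_out_len'] == 1024     # aliasing leaks the mutation
assert copied[0]['max_out_len'] == 512       # copies are unaffected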
@@ -50,7 +50,7 @@ for category in categories:
         abbr=f"korbench_mixed_{category}",
         path="opencompass/korbench",
         category=category,
-        mode='mixed',
+        prompt_mode='mixed',
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
         eval_cfg=eval_cfg,
@@ -50,7 +50,7 @@ for category in categories:
         type=korbenchDataset,
         abbr=f"korbench_{category}",
         path="opencompass/korbench",
-        mode='0_shot',
+        prompt_mode='0_shot',
         category=category,
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
@@ -1,4 +1,7 @@
-from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
+from opencompass.datasets.korbench.korbench import (
+    korbenchDataset,
+    korbenchEvaluator,
+)

 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -13,19 +16,9 @@ for category in categories:
     prompt_template = dict(
         type=PromptTemplate,
         template=dict(
-            begin=[
-                dict(
-                    role="HUMAN",
-                    prompt=""
-                )
-            ],
-            round=[
-                dict(
-                    role="HUMAN",
-                    prompt="{prompt}"  # f-string
-                )
-            ]
-        )
+            begin=[dict(role="HUMAN", prompt="")],
+            round=[dict(role="HUMAN", prompt="{prompt}")],  # f-string
+        ),
     )

     # Reader configuration
@@ -51,7 +44,7 @@ for category in categories:
         type=korbenchDataset,
         abbr=f"korbench_{category}",
         path="opencompass/korbench",
-        mode='3_shot',
+        prompt_mode='3_shot',
         category=category,
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
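All three korbench hunks make the same mechanical change: the dataset kwarg mode becomes prompt_mode (the template flattening in the 3-shot config is purely cosmetic). If the loader wanted to keep accepting the old spelling, a shim like the following would do; this is a hypothetical sketch, not korbenchDataset's actual code:

import warnings

def load_korbench(path, category, prompt_mode=None, mode=None, **kwargs):
    # Hypothetical compatibility shim: prefer 'prompt_mode', honor 'mode'.
    if prompt_mode is None and mode is not None:
        warnings.warn("'mode' is deprecated; use 'prompt_mode'",
                      DeprecationWarning, stacklevel=2)
        prompt_mode = mode
    return dict(path=path, category=category, prompt_mode=prompt_mode, **kwargs)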
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    MATHDataset,
    MATHEvaluator,
    math_postprocess_v2,
    normalize_final_answer,
)

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math_prm800k_500',
        path='opencompass/math',
        file_name='test_prm800k_500.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
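The prompt instructs the model to leave its final answer inside \boxed{}, and math_postprocess_v2 then reduces the free-form response to a short answer string before comparison. Purely as an illustration of that extraction step (assumed behavior; the library's real postprocessor also normalizes the answer), a minimal regex version:

import re

def extract_boxed(text: str) -> str:
    # Sketch only: last \boxed{...} wins; nested braces are not handled.
    matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
    return matches[-1].strip() if matches else text.strip()

print(extract_boxed(r'Thus the sum is \boxed{42}.'))  # -> 42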