[Feature] Add OC academic 2412 (#1750)

Linchen Xiao 2024-12-10 21:53:06 +08:00 committed by GitHub
parent 54c0fb7a93
commit 0d26b348e4
10 changed files with 307 additions and 21 deletions

View File

@@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
                 ]),
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
         )
     subjective_eval_cfg = dict(

View File

@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
 )
 subjective_eval_cfg = dict(

View File

@@ -0,0 +1,152 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
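# read_base() is mmengine's config mechanism: the imports inside the block are
# resolved as config fragments and their symbols are merged into this namespace.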
with read_base():
    # Datasets Part
    ## Core Set
    # Knowledge
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
        mmlu_pro_datasets,
    )

    # General Reasoning
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
        gpqa_datasets,
    )
    from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
        bbh_datasets,
    )
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
        humaneval_datasets,
    )

    # Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import (
        ifeval_datasets,
    )
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import (
        LCBCodeGeneration_dataset,
    )

    # Math
    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
        cmo_fib_datasets,
    )
    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
        aime2024_datasets,
    )
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import (
        math_datasets,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import (
        mmlu_pro_summary_groups,
    )

    # Model List
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as hf_internlm2_5_7b_chat_model,
    )
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')), []
) + [LCBCodeGeneration_dataset]
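# locals() collects every `*_datasets` list pulled in through read_base();
# the LiveCodeBench code-generation entry is a single dataset dict, so it is
# appended explicitly instead of being matched by the suffix scan.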
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
core_summary_groups = [
    {
        'name': 'core_average',
        'subsets': [
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['bbh', 'naive_average'],
            ['math_prm800k_500', 'accuracy'],
            ['cmo_fib', 'accuracy'],
            ['aime2024', 'accuracy'],
            ['GPQA_diamond', 'accuracy'],
            ['mmlu_pro', 'naive_average'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['lcb_code_generation', 'pass@1'],
        ],
    },
]
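# 'core_average' averages the (dataset abbreviation, metric) pairs listed above
# into a single headline number for the summary table.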
summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['cmo_fib', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['lcb_code_generation', 'pass@1'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
    ),
)
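# Plain strings in dataset_abbrs ('' or headings such as 'Knowledge') do not
# match any dataset and act as separators/section titles in the printed summary;
# two-element entries select one metric for a dataset or summary group.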
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
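# Every config imported above whose name ends in `_model` is collected here;
# additional models can be evaluated by adding their imports to the read_base() block.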
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask),
    ),
)
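# NumWorkerPartitioner shards each dataset into `num_worker` sub-tasks, and
# LocalRunner executes at most `max_num_workers` inference tasks concurrently.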
# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)
    ),
)
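# Evaluation reuses LocalRunner; NaivePartitioner does not split datasets and,
# as configured here, groups model/dataset pairs (n=10 at a time) into tasks.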
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
work_dir = './outputs/oc_academic_202412'
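# Typical launch from a repo checkout (the config path below is illustrative,
# not necessarily the file name used in this commit):
#     python run.py configs/eval_oc_academic_202412.py --debug
# Prediction and evaluation results are written under the work_dir above.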

View File

@@ -0,0 +1,96 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]
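# Multiple-choice subsets are scored with BBHEvaluator_mcq after
# bbh_mcq_postprocess extracts the chosen option; free-form subsets use
# BBHEvaluator, which matches the extracted final answer as plain text.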
bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

View File

@@ -50,7 +50,7 @@ for category in categories:
         abbr=f"korbench_mixed_{category}",
         path="opencompass/korbench",
         category=category,
-        mode='mixed',
+        prompt_mode='mixed',
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,
         eval_cfg=eval_cfg,

View File

@@ -50,7 +50,7 @@ for category in categories:
         type=korbenchDataset,
         abbr=f"korbench_{category}",
         path="opencompass/korbench",
-        mode='0_shot',
+        prompt_mode='0_shot',
         category=category,
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,

View File

@@ -1,4 +1,7 @@
-from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
+from opencompass.datasets.korbench.korbench import (
+    korbenchDataset,
+    korbenchEvaluator,
+)
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -13,19 +16,9 @@ for category in categories:
     prompt_template = dict(
         type=PromptTemplate,
         template=dict(
-            begin=[
-                dict(
-                    role="HUMAN",
-                    prompt=""
-                )
-            ],
-            round=[
-                dict(
-                    role="HUMAN",
-                    prompt="{prompt}" # f-string
-                )
-            ]
-        )
+            begin=[dict(role="HUMAN", prompt="")],
+            round=[dict(role="HUMAN", prompt="{prompt}")],  # f-string
+        ),
     )

     # Reader configuration
@@ -51,7 +44,7 @@ for category in categories:
         type=korbenchDataset,
         abbr=f"korbench_{category}",
         path="opencompass/korbench",
-        mode='3_shot',
+        prompt_mode='3_shot',
         category=category,
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg,

View File

@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    MATHDataset,
    MATHEvaluator,
    math_postprocess_v2,
    normalize_final_answer,
)
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)
# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
)
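# math_postprocess_v2 extracts the final \boxed{...} answer from the model
# output, and MATHEvaluator (version='v2') checks it against the normalized
# reference answer.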
math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math_prm800k_500',
        path='opencompass/math',
        file_name='test_prm800k_500.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]

View File

@@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
                 ]),
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
         )
     subjective_eval_cfg = dict(

View File

@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
 )
 subjective_eval_cfg = dict(