mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

[Update] Update OC academic 202412 (#1771)

* [Update] Update academic settings
* Update
* update

This commit is contained in:
parent d70100cdf2
commit ebefffed61
@@ -79,6 +79,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
 We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
 
+You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
+
 <p align="right"><a href="#top">🔝Back to top</a></p>
 
 ## 🛠️ Installation
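(Aside, not part of the diff: a config like this is normally launched with the project's runner, e.g. `python run.py configs/eval_academic_leaderboard_202412.py`; a config-level sketch follows the next hunk.)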
@@ -77,6 +77,8 @@
 We will progressively provide detailed performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home). To join the evaluation, please send the model repository URL or a standard API interface to `opencompass@pjlab.org.cn`.
 
+You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
+
 <p align="right"><a href="#top">🔝Back to top</a></p>
 
 ## 🛠️ Installation Guide
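Both README hunks describe the same reproduction path. As a config-level sketch (not part of the commit), this is roughly what pairing one of the academic datasets with a model looks like, assuming OpenCompass's standard `read_base()`/`models` pattern; the model entry below is a hypothetical placeholder.

```python
# Hedged sketch, not from this commit: one academic dataset plus one model,
# following the standard OpenCompass config layout.
from mmengine.config import read_base
from opencompass.models import HuggingFacewithChatTemplate

with read_base():
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
        mmlu_pro_datasets,
    )

datasets = [*mmlu_pro_datasets]

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='my-chat-model',      # hypothetical name
        path='org/my-chat-model',  # hypothetical HF repo id
        max_out_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]
```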
@@ -10,12 +10,10 @@ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
 #######################################################################
 with read_base():
     # Datasets Part
     ## Core Set
     # Knowledge
     from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
         mmlu_pro_datasets,
     )
 
     # General Reasoning
     from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
         gpqa_datasets,
@@ -23,22 +21,19 @@ with read_base():
     from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
         bbh_datasets,
     )
-    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
         humaneval_datasets,
     )
 
     # Instruction Following
-    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import (
+    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
         ifeval_datasets,
     )
-    from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import (
+    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
         LCBCodeGeneration_dataset,
     )
 
     # Math
-    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
-        cmo_fib_datasets,
-    )
     from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
         aime2024_datasets,
     )
@@ -77,7 +72,6 @@ core_summary_groups = [
         ['IFEval', 'Prompt-level-strict-accuracy'],
         ['bbh', 'naive_average'],
         ['math_prm800k_500', 'accuracy'],
-        ['cmo_fib', 'accuracy'],
         ['aime2024', 'accuracy'],
         ['GPQA_diamond', 'accuracy'],
         ['mmlu_pro', 'naive_average'],
@@ -101,7 +95,6 @@ summarizer = dict(
         '',
         'Math Calculation',
         ['math_prm800k_500', 'accuracy'],
-        ['cmo_fib', 'accuracy'],
         ['aime2024', 'accuracy'],
         '',
         'Knowledge',
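For orientation on the fragments above (assumed from the usual OpenCompass conventions, not text in this commit): every imported `*_datasets` list is gathered into one `datasets` list, summary groups average the listed `[abbr, metric]` pairs, and plain strings in `dataset_abbrs` render as separator or header rows in the report.

```python
# Hedged sketch of the structure around the hunks above; the group name and
# exact layout are assumptions, the [abbr, metric] pairs mirror the diff.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

core_summary_groups = [
    dict(
        name='core_average',  # assumed group name
        subsets=[
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['bbh', 'naive_average'],
            ['math_prm800k_500', 'accuracy'],
            ['aime2024', 'accuracy'],
            ['GPQA_diamond', 'accuracy'],
            ['mmlu_pro', 'naive_average'],
        ],
    ),
]

summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        '',                    # blank separator row
        'Math Calculation',    # section header row
        ['math_prm800k_500', 'accuracy'],
        ['aime2024', 'accuracy'],
    ],
    summary_groups=core_summary_groups,
)
```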
@@ -0,0 +1,164 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# Test Output Prediction Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    LCBCodeExecution_dataset,
+    LCBTestOutput_dataset,
+]
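Tying the new file to the leaderboard config: only the code-generation split is imported there. A minimal usage sketch, assuming this new file is the `livecodebench_gen_a4f90b` config referenced in the hunk further up:

```python
# Hedged usage sketch: pull just the generation split into an eval config,
# as eval_academic_leaderboard_202412.py does above; the execution and
# test-output splits stay available through LCB_datasets.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,
    )

datasets = [LCBCodeGeneration_dataset]
```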
@@ -23,7 +23,7 @@ math_infer_cfg = dict(
         ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=1024),
+    inferencer=dict(type=GenInferencer),
 )
 
 # postprocess v2
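(Presumably the point of dropping the hard-coded `max_out_len=1024` is to let the output budget be set per model instead, since long chain-of-thought solutions to MATH-500-style problems routinely exceed 1024 tokens; the commit itself does not state the motivation.)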
@@ -163,6 +163,8 @@ class BigCodeBenchEvaluator(BaseEvaluator):
             logger.info('Read timeout error. Retrying in 4s...')
             time.sleep(4)
 
+        if 'pass@1' in pass_at_k.keys():
+            pass_at_k['pass@1'] *= 100
         dump_results = {'details': results}
         dump_results.update(pass_at_k)
 
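(The added guard rescales `pass@1` from a 0-1 fraction to a 0-100 percentage before it is merged into `dump_results`, e.g. `{'pass@1': 0.42}` becomes `42.0`, presumably so it reads on the same scale as the accuracy-style metrics elsewhere in the summarizer.)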