# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard

import os.path as osp
from itertools import product

from mmengine.config import read_base

from opencompass.models import OpenAISDK, TurboMindModelwithChatTemplate
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

#######################################################################
#                        PART 1  Datasets List                        #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate

    # Datasets
    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets  # 1 run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets  # 8 runs
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets  # 1 run
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets

    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

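# Gather every `*_datasets` list imported above into one flat list:
# `sum(iterable_of_lists, [])` concatenates them, picking up any name in the
# current scope that ends with `_datasets`.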
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# Set the LLM verifier used for each dataset
verifier_cfg = dict(
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',  # You need to set your own judge model path
    key='sk-1234',  # You need to set your own API key
    openai_api_base=[
        'http://172.30.56.1:4000/v1',  # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    # max_seq_len=32768,
    max_seq_len=49152,
)
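
# The verifier only needs an OpenAI-compatible endpoint. One possible setup
# (an assumption, not something this config mandates) is to serve the judge
# model with vLLM, e.g. `vllm serve Qwen/Qwen2.5-32B-Instruct --port 4000`,
# and point `openai_api_base` at it.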

for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment this line if you want to avoid length cutoff
    # Attach the verifier to every dataset whose evaluator uses an LLM judge
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg

#######################################################################
#                         PART 2  Model List                          #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate
    # All models use sampling mode
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=32768,
    #     ),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=16384,
    #     ),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
]
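
# The commented-out 14B/32B entries follow the same pattern; `tp` in
# engine_config and `num_gpus` in run_cfg scale together with model size,
# and the 32B entry halves max_new_tokens/max_out_len within the same
# 32k session length.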

#######################################################################
#                    PART 3  Inference/Evaluation                     #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=1,
        # Similar to data parallelism: how many workers run inference,
        # each handling a part of the dataset.
        # Total GPUs = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model using 1 GPU per instance,
        # set num_worker=8 to fully utilize the GPUs.
        # With 8 GPUs and a 14B model using 2 GPUs per instance, set num_worker=4.
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
)
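
# `n=8` asks the NaivePartitioner to bundle evaluation tasks in groups of
# eight per job, reducing the number of jobs the LocalRunner has to launch.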

#######################################################################
#                          PART 4  Summarizer                         #
#######################################################################

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ]
    }
])
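
# Each "Average8" group averages per-run accuracy over the 8 repeated runs
# of the same dataset; the summarizer reports it as `naive_average` below.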

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        # ['aime2024', 'accuracy'],
        ['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)

#######################################################################
#                            PART 5  Utils                            #
#######################################################################

work_dir = 'outputs/deepseek_r1_reasoning'
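
# Typical launch (standard OpenCompass entry point; adjust the path to
# wherever this config lives in your checkout):
#   python run.py <path/to/this/config>.py
# Predictions, eval results, and the summary table land under `work_dir`.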