# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard
from itertools import product

from mmengine.config import read_base

from opencompass.models import OpenAISDK, TurboMindModelwithChatTemplate
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

#######################################################################
#                       PART 1  Datasets List                         #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate
    # Datasets
    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets  # 1 run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets  # 8 runs
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets  # 1 run
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# Set the LLM verifier used for each dataset
verifier_cfg = dict(
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',  # You need to set your own judge model path
    key='sk-1234',  # You need to set your own API key
    openai_api_base=[
        'http://172.30.56.1:4000/v1',  # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    # max_seq_len=32768,
    max_seq_len=49152,
)

for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment this line to avoid length cutoff
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg

#######################################################################
#                         PART 2  Model List                          #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate
    # All models use sampling mode
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=32768),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=16384),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
]
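# Optional sketch (not part of the original config): evaluating a model that
# is served behind an OpenAI-compatible endpoint instead of a local TurboMind
# instance. The abbr, path, key and URL below are hypothetical placeholders;
# OpenAISDK accepts the same fields already used for verifier_cfg above.
# Uncomment and adapt if this fits your setup.
# models += [
#     dict(
#         abbr='my-served-model',  # hypothetical name
#         type=OpenAISDK,
#         path='my-org/my-model',  # model name as exposed by the endpoint
#         key='sk-xxxx',  # your API key
#         openai_api_base=['http://127.0.0.1:8000/v1'],  # your API base
#         meta_template=verifier_cfg['meta_template'],  # reuse the chat template above
#         query_per_second=8,
#         batch_size=64,
#         temperature=0.6,
#         max_out_len=32768,
#         max_seq_len=32768,
#         pred_postprocessor=dict(type=extract_non_reasoning_content),
#     ),
# ]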
#######################################################################
#                    PART 3  Inference/Evaluation                     #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=1,
        # Similar to data parallelism: each worker runs inference on a slice
        # of the dataset. Total GPUs = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model using 1 GPU per instance,
        # set num_worker=8 to fully utilize the GPUs; with a 14B model using
        # 2 GPUs per instance, set num_worker=4.
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=8,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
)

#######################################################################
#                        PART 4  Summarizer                           #
#######################################################################
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        # ['aime2024', 'accuracy'],
        ['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)

#######################################################################
#                           PART 5  Utils                             #
#######################################################################
work_dir = 'outputs/deepseek_r1_reasoning'
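
# How to launch (a sketch; the config path below is hypothetical, use
# wherever you saved this file, e.g. configs/eval_deepseek_r1.py):
#
#     python run.py configs/eval_deepseek_r1.py
#
# or, with OpenCompass installed as a package:
#
#     opencompass configs/eval_deepseek_r1.py
#
# Predictions, evaluation results and the summary table are written under
# work_dir ('outputs/deepseek_r1_reasoning').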