# Supported benchmarks:
#   - AIME-2024 (8 repeated runs)
#   - MATH-500
#   - OlympiadBench
#   - OmniMath
#   - LiveMathBench-202412-Hard
import os.path as osp
from itertools import product

from mmengine.config import read_base

from opencompass.models import (MiniMaxChatCompletionV2, OpenAISDK,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

#######################################################################
#                        PART 1  Datasets List                        #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate
    # Datasets
    from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import \
        math_datasets  # 1 run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import \
        aime2024_datasets  # 8 runs
    from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import \
        olympiadbench_datasets
    from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import \
        omnimath_datasets  # 1 run
    from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import \
        livemathbench_datasets
    from opencompass.configs.models.minimax.minimax_deepseek_r1 import \
        minimax_ds_r1

    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import \
        OlympiadBenchMath_summary_groups

# datasets = sum(
#     (v for k, v in locals().items() if k.endswith('_datasets')),
#     [],
# )
datasets = aime2024_datasets

# Set the LLM verifier (judge) used for each dataset
# verifier_cfg = dict(
#     abbr='qwen2-5-32B-Instruct',
#     type=OpenAISDK,
#     path='Qwen/Qwen2.5-32B-Instruct',  # You need to set your own judge model path
#     key='sk-1234',  # You need to set your own API key
#     openai_api_base=[
#         'http://172.30.56.1:4000/v1',  # You need to set your own API base
#     ],
#     meta_template=dict(
#         round=[
#             dict(role='HUMAN', api_role='HUMAN'),
#             dict(role='BOT', api_role='BOT', generate=True),
#         ], ),
#     query_per_second=16,
#     batch_size=1024,
#     temperature=0.001,
#     tokenizer_path='gpt-4o-2024-05-13',
#     verbose=True,
#     max_out_len=16384,
#     # max_seq_len=32768,
#     max_seq_len=49152,
# )
verifier_cfg = dict(
    abbr='MiniMax-Text-01',
    type=MiniMaxChatCompletionV2,
    path='MiniMax-Text-01',
    url='https://api.minimax.chat/v1/text/chatcompletion_v2',
    key='YOUR_API_KEY',  # You need to set your own API key
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ], ),
    query_per_second=3,
    retry=1,
    max_out_len=4096,
    max_seq_len=32768,
    batch_size=2,
)
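
# Optionally read the judge API key from the environment instead of hardcoding
# it above. `MINIMAX_API_KEY` is an illustrative variable name chosen for this
# sketch, not something OpenCompass reads on its own.
import os
if os.environ.get('MINIMAX_API_KEY'):
    verifier_cfg['key'] = os.environ['MINIMAX_API_KEY']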

for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment to override max_out_len for every dataset and avoid length cutoff
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg

#######################################################################
#                          PART 2  Model List                         #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate
    # All models use sampling mode
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(do_sample=True,
                        temperature=0.6,
                        top_p=0.95,
                        max_new_tokens=32768),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(do_sample=True,
    #                     temperature=0.6,
    #                     top_p=0.95,
    #                     max_new_tokens=32768),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(do_sample=True,
    #                     temperature=0.6,
    #                     top_p=0.95,
    #                     max_new_tokens=16384),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
]

models = minimax_ds_r1
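# The assignment above overwrites the TurboMind list and evaluates only the
# MiniMax-served DeepSeek-R1 model imported through read_base(). To evaluate
# the local TurboMind models as well, a sketch (assuming minimax_ds_r1 is a
# list of model dicts, as its use here suggests) is to extend instead:
#   models += minimax_ds_r1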

#######################################################################
#                     PART 3  Inference/Evaluation                    #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        # Similar to data parallelism: each worker runs inference on a shard
        # of the dataset, and total GPUs = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model that needs 1 GPU per
        # instance, set num_worker=8 to keep every GPU busy; with a 14B model
        # that needs 2 GPUs per instance, set num_worker=4.
        num_worker=1,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=8,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
)

#######################################################################
#                         PART 4  Summarizer                          #
#######################################################################
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [[f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
                    for split, run_idx in product(['hard_cn', 'hard_en'], range(8))],
    },
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        # 'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        ['aime2024', 'accuracy'],
        # ['math_prm800k_500-llmjudge', 'accuracy'],
        # ['AIME2024-Average8', 'naive_average'],
        # ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        # ['OlympiadBenchMath', 'accuracy'],
        # ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)

#######################################################################
#                           PART 5  Utils                             #
#######################################################################
work_dir = 'outputs/deepseek_r1_reasoning'
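
# A minimal sketch of how this config is typically launched (the file path is
# illustrative; point it at wherever this config is saved):
#   python run.py examples/eval_deepseek_r1_minimax.py --debug
# Predictions, evaluation results, and summaries are written under `work_dir`.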