diff --git a/examples/eval_deepseek_r1.py b/examples/eval_deepseek_r1.py
new file mode 100644
index 00000000..4f477b62
--- /dev/null
+++ b/examples/eval_deepseek_r1.py
@@ -0,0 +1,210 @@
# Supported datasets:
# - AIME-2024 (repeated 8 times)
# - MATH-500
# - OlympiadBench
# - OmniMath
# - LiveMathBench-202412-Hard

from itertools import product

from mmengine.config import read_base

from opencompass.models import OpenAISDK, TurboMindModelwithChatTemplate
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

#######################################################################
#                       PART 1  Datasets List                         #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate

    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets  # 1 run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets  # 8 runs
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets  # 1 run
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets

    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# Set the LLM verifier (judge model) used by every dataset

verifier_cfg = dict(
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',  # Set your own judge model path
    key='sk-1234',  # Set your own API key
    openai_api_base=[
        'http://172.30.56.1:4000/v1',  # Set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    # max_seq_len=32768,
    max_seq_len=49152,
)

for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment to avoid output-length cutoff
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg


#######################################################################
#                         PART 2  Model List                          #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate.
    # All models use sampling mode.
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
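        # Strip the `<think>...</think>` reasoning block from each prediction
        # so that only the final answer is passed to the evaluator: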
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=32768),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=16384),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
]

#######################################################################
#                    PART 3  Inference/Evaluation                     #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        # Similar to data parallelism: the number of workers that run
        # inference, each handling a part of the dataset.
        # Total GPUs = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model that uses 1 GPU per
        # instance, set num_worker=8 to fully utilize the GPUs; with 8 GPUs
        # and a 14B model that uses 2 GPUs per instance, set num_worker=4.
        num_worker=1,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
)


#######################################################################
#                         PART 4  Summarizer                          #
#######################################################################

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        # ['aime2024', 'accuracy'],
        ['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)


#######################################################################
#                           PART 5  Utils                             #
#######################################################################

work_dir = 'outputs/deepseek_r1_reasoning'
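
# Usage sketch (assuming a standard OpenCompass source checkout; the exact
# entry point may differ in your installation):
#
#     python run.py examples/eval_deepseek_r1.py
#
# Predictions, evaluation details, and the summarized score table are then
# written under the `work_dir` configured above.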