Update DeepSeek-R1 example

2025-05-30 16:03:24 +08:00 · 2025-02-27 16:02:20 +00:00 · 2025-02-27 16:02:20 +00:00 · 8103c0d245
commit 8103c0d245
parent ba7163ce2e
1 changed files with 210 additions and 0 deletions
--- a/examples/eval_deepseek_r1.py
+++ b/examples/eval_deepseek_r1.py
@ -0,0 +1,210 @@
+# Support AIME-2024 with Repeat8
+# Support MATH-500
+# Support OlympiadBench
+# Support OmniMath
+# Support LiveMathBench-202412-Hard
+
+import os.path as osp
+from itertools import product
+from opencompass.models import OpenAISDK
+from mmengine.config import read_base
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+from opencompass.runners import LocalRunner
+from opencompass.models import (
+    TurboMindModelwithChatTemplate,
+)
+
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+with read_base():
+    # You can comment out the datasets you don't want to evaluate
+
+    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
+    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
+    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
+    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
+    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
+
+    # Summarizer
+    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
+
+datasets = sum(  # concatenate every list bound to a module-level name ending in '_datasets'
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],  # start value, so the result is an empty list when no name matches
+)
+
+# LLM verifier (judge model) settings. The loop that follows assigns this dict
+# to every dataset evaluator that already contains a 'judge_cfg' entry.
+verifier_cfg = dict(
+        abbr='qwen2-5-32B-Instruct',
+        type=OpenAISDK,
+        path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
+        key='sk-1234', # Placeholder: set your own API key (do not commit a real key)
+        openai_api_base=[
+            'http://172.30.56.1:4000/v1', # You need to set your own API base
+        ],
+        meta_template=dict(
+            round=[
+                dict(role='HUMAN', api_role='HUMAN'),
+                dict(role='BOT', api_role='BOT', generate=True),
+            ], 
+        ),
+        query_per_second=16,
+        batch_size=1024,
+        temperature=0.001,  # near zero; presumably keeps judge verdicts close to deterministic
+        tokenizer_path='gpt-4o-2024-05-13',
+        verbose=True,
+        max_out_len=16384,
+        # Smaller alternative: max_seq_len=32768,
+        max_seq_len=49152,
+)
+
+for item in datasets:
+    # item['infer_cfg']['inferencer']['max_out_len'] = 32768 # Uncomment this line if you want to avoid length cutoff
+    if 'judge_cfg' in item['eval_cfg']['evaluator']:  # only touch evaluators that already declare a judge
+        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg  # the same dict object is shared by all datasets
+
+
+#######################################################################
+#                          PART 2  Model List                         #
+#######################################################################
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])  # gather lists bound to names ending in '_model'
+
+models += [
+    # You can comment out the models you don't want to evaluate
+    # All models use sampling mode (do_sample=True, temperature=0.6, top_p=0.95)
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='deepseek-r1-distill-qwen-7b-turbomind',
+        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
+        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
+        gen_config=dict(
+                        do_sample=True,
+                        temperature=0.6,
+                        top_p=0.95,
+                        max_new_tokens=32768),
+        max_seq_len=32768,
+        max_out_len=32768,
+        batch_size=64,
+        run_cfg=dict(num_gpus=1),  # same value as tp in engine_config above
+        pred_postprocessor=dict(type=extract_non_reasoning_content)  # presumably drops the reasoning part before scoring - confirm
+    ),
+    # dict(
+    #     type=TurboMindModelwithChatTemplate,
+    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
+    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
+    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
+    #     gen_config=dict(
+    #                     do_sample=True,
+    #                     temperature=0.6,
+    #                     top_p=0.95,
+    #                     max_new_tokens=32768),
+    #     max_seq_len=32768,
+    #     max_out_len=32768,
+    #     batch_size=128,
+    #     run_cfg=dict(num_gpus=2),
+    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
+    # ),
+    # dict(
+    #     type=TurboMindModelwithChatTemplate,
+    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
+    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
+    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
+    #     gen_config=dict(
+    #                     do_sample=True,
+    #                     temperature=0.6,
+    #                     top_p=0.95,
+    #                     max_new_tokens=16384),
+    #     max_seq_len=32768,
+    #     max_out_len=16384,
+    #     batch_size=128,
+    #     run_cfg=dict(num_gpus=4),
+    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
+    # ),
+]
+
+#######################################################################
+#                          PART 3  Inference/Evaluation               #
+#######################################################################
+
+# Inference configuration
+infer = dict(
+    partitioner=dict(
+        type=NumWorkerPartitioner,
+        num_worker=1
+        # Similar to data parallelism: num_worker is the number of workers run in parallel,
+        # and each worker handles a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
+        # For example, if you have 8 GPUs and a 7B model uses 1 GPU per instance, you can set num_worker=8
+        # to fully utilize the GPUs.
+        # If you have 8 GPUs and a 14B model uses 2 GPUs per instance, you can set num_worker=4
+    ),
+    runner=dict(
+        type=LocalRunner,
+        task=dict(type=OpenICLInferTask)
+    ),
+)
+
+# Evaluation configuration
+eval = dict(  # NOTE(review): shadows the builtin eval; presumably the name the framework expects - confirm before renaming
+    partitioner=dict(
+        type=NaivePartitioner, n=8  # NOTE(review): meaning of n is defined by NaivePartitioner - confirm
+    ),
+    runner=dict(
+        type=LocalRunner,
+        task=dict(
+            type=OpenICLEvalTask)
+    ),
+)
+
+
+#######################################################################
+#                          PART 4  Summarizer                         #
+#######################################################################
+
+
+summary_groups = sum(  # merge every list bound to a module-level name ending in '_summary_groups'
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+)
+
+summary_groups.extend([
+    {
+        'name': 'AIME2024-Aveage8',  # NOTE(review): 'Aveage' looks like a typo of 'Average'; summarizer.dataset_abbrs uses the same spelling
+        'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]  # run indices 0..7
+    },
+    {
+        'name': 'LiveMathBench-v202412-Hard-Aveage8',
+        'subsets':[[
+            f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy'] 
+                for split, run_idx in product(['hard_cn', 'hard_en'], range(8))  # 2 splits x 8 runs = 16 subsets
+        ]
+    }
+])
+
+# Summarizer: entries to list, plus the group definitions collected in summary_groups
+summarizer = dict(
+    dataset_abbrs=[
+        'MATH',
+        # ['LiveMathBench-k1-n1', 'pass@1'],
+        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
+        # ['aime2024', 'accuracy'],
+        ['math_prm800k_500-llmjudge', 'accuracy'],
+        ['AIME2024-Aveage8', 'naive_average'],  # must match the group name added to summary_groups
+        ['LiveMathBench-v202412-Hard-Aveage8', 'naive_average'],  # must match the group name added to summary_groups
+        ['OlympiadBenchMath', 'accuracy'],
+        ['OmniMath', 'accuracy'],
+    ],
+    summary_groups=summary_groups,
+)
+
+
+#######################################################################
+#                          PART 5  Utils                              #
+#######################################################################
+
+work_dir = "outputs/deepseek_r1_reasoning"  # relative path; presumably where run outputs are written - confirm
+
+