OpenCompass/examples/eval_deepseek_r1.py

# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard

import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK, MiniMaxChatCompletionV2
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
    TurboMindModelwithChatTemplate,
)

#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate

    # Datasets
    from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets  # 1 Run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets  # 8 Run
    from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import \
        olympiadbench_datasets
    from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets  # 1 Run
    from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import \
        livemathbench_datasets
    from opencompass.configs.models.minimax.minimax_deepseek_r1 import minimax_ds_r1

    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

# datasets = sum(
#     (v for k, v in locals().items() if k.endswith('_datasets')),
#     [],
# )
datasets = aime2024_datasets

# Set LLM Verifier used for each dataset

# verifier_cfg = dict(
#     abbr='qwen2-5-32B-Instruct',
#     type=OpenAISDK,
#     path='Qwen/Qwen2.5-32B-Instruct',  # You need to set your own judge model path
#     key='sk-1234',  # You need to set your own API key
#     openai_api_base=[
#         'http://172.30.56.1:4000/v1',  # You need to set your own API base
#     ],
#     meta_template=dict(
#         round=[
#             dict(role='HUMAN', api_role='HUMAN'),
#             dict(role='BOT', api_role='BOT', generate=True),
#         ],
#     ),
#     query_per_second=16,
#     batch_size=1024,
#     temperature=0.001,
#     tokenizer_path='gpt-4o-2024-05-13',
#     verbose=True,
#     max_out_len=16384,
#     # max_seq_len=32768,
#     max_seq_len=49152,
# )

verifier_cfg = dict(abbr='MiniMax-Text-01',
                    type=MiniMaxChatCompletionV2,
                    path='MiniMax-Text-01',
                    url='https://api.minimax.chat/v1/text/chatcompletion_v2',
                    key='eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJHcm91cE5hbWUiOiLlmLvlmLvlk4jlk4giLCJVc2VyTmFtZSI6IuWYu-WYu-WTiOWTiCIsIkFjY291bnQiOiIiLCJTdWJqZWN0SUQiOiIxNzQzNTAzNjg0MDUxOTMxMjk0IiwiUGhvbmUiOiIxODEwMDE3NjQ5OCIsIkdyb3VwSUQiOiIxNzQzNTAzNjg0MDQzNTQyNjg2IiwiUGFnZU5hbWUiOiIiLCJNYWlsIjoiIiwiQ3JlYXRlVGltZSI6IjIwMjUtMDMtMDcgMTY6MzU6MjQiLCJUb2tlblR5cGUiOjEsImlzcyI6Im1pbmltYXgifQ.p4QeQoT6Sk7hOCUxIfPpPJqFstT61DoKy7RGh5BcfhyLDO-l_0WFcbfyA212fCwxbCJc-RWCwM2H8q0nfMvKricNY9cXoFcQp2wCqcW11Rq6fhIpE8FzQz4HTDSlHEec9mwGDdeTOqOXUALhgqYho2anH2VP8aoARuXOSY8He_KyBxHRvODucarRkYWOjMUd20DRni7SGm8n_Gi2B_DacGW1ie60U8t2Aahna5h7pGFqudP0r-_YUtDabuqPX0Vo_EKPu1ZyVrY7jP_YT4FVx6AtYBrZcTzcq-KTm_F86-1sioUXzz9oPo3JFFJudhwbVQYrySB5jqJUJNoXP8OzDA',
                    # You need to set your own API key
                    meta_template=dict(
                        round=[
                            dict(role='HUMAN', api_role='HUMAN'),
                            dict(role='BOT', api_role='BOT', generate=True),
                        ],
                    ),
                    query_per_second=3,
                    retry=1,
                    max_out_len=4096, max_seq_len=32768, batch_size=2)

for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768 # You can unset this line if you want to avoid length cutoff
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg

#######################################################################
#                          PART 2  Model List                         #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate
    # All models use sampling mode
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content)
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(
    #                     do_sample=True,
    #                     temperature=0.6,
    #                     top_p=0.95,
    #                     max_new_tokens=32768),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(
    #                     do_sample=True,
    #                     temperature=0.6,
    #                     top_p=0.95,
    #                     max_new_tokens=16384),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
    # ),
]
models = minimax_ds_r1

#######################################################################
#                          PART 3  Inference/Evaluation               #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=1
        # Similar with data-parallelism, how many workers for evaluation,
        # each worker will evaluate a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
        # For example, If you have 8 GPUs, for 7B model using 1 GPU for one instance, you can set num_worker=8
        # to max-utilize the GPUs.
        # If you have 8 GPUs, for 14B model using 2 GPUs for one instance, you can set num_worker=4
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask)
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner, n=8
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(
            type=OpenICLEvalTask)
    ),
)

#######################################################################
#                          PART 4  Summarizer                         #
#######################################################################


summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

summary_groups.extend([
    {
        'name': 'AIME2024-Aveage8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Aveage8',
        'subsets': [[
            f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ]
    }
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        # 'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        ['aime2024', 'accuracy'],
        # ['math_prm800k_500-llmjudge', 'accuracy'],
        # ['AIME2024-Aveage8', 'naive_average'],
        # ['LiveMathBench-v202412-Hard-Aveage8', 'naive_average'],
        # ['OlympiadBenchMath', 'accuracy'],
        # ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)

#######################################################################
#                          PART 5  Utils                              #
#######################################################################

work_dir = 'outputs/deepseek_r1_reasoning'
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00			`# Support AIME-2024 with Repeat8`
			`# Support MATH-500`
			`# Support OlympiadBench`
			`# Support OmniMath`
			`# Support LiveMathBench-202412-Hard`

			`import os.path as osp`
			`from itertools import product`
测试 2025-03-07 18:00:48 +08:00			`from opencompass.models import OpenAISDK, MiniMaxChatCompletionV2`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00			`from mmengine.config import read_base`
			`from opencompass.utils.text_postprocessors import extract_non_reasoning_content`
			`from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner`
			`from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask`
			`from opencompass.runners import LocalRunner`
			`from opencompass.models import (`
			`TurboMindModelwithChatTemplate,`
			`)`

			`#######################################################################`
			`# PART 1 Datasets List #`
			`#######################################################################`
			`with read_base():`
			`# You can comment out the datasets you don't want to evaluate`

			`# Datasets`
测试 2025-03-07 18:00:48 +08:00			`from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run`
			`from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run`
			`from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import \`
			`olympiadbench_datasets`
			`from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run`
			`from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import \`
			`livemathbench_datasets`
			`from opencompass.configs.models.minimax.minimax_deepseek_r1 import minimax_ds_r1`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00
			`# Summarizer`
			`from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups`

测试 2025-03-07 18:00:48 +08:00			`# datasets = sum(`
			`# (v for k, v in locals().items() if k.endswith('_datasets')),`
			`# [],`
			`# )`
			`datasets = aime2024_datasets`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00
			`# Set LLM Verifier used for each dataset`

测试 2025-03-07 18:00:48 +08:00			`# verifier_cfg = dict(`
			`# abbr='qwen2-5-32B-Instruct',`
			`# type=OpenAISDK,`
			`# path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path`
			`# key='sk-1234', # You need to set your own API key`
			`# openai_api_base=[`
			`# 'http://172.30.56.1:4000/v1', # You need to set your own API base`
			`# ],`
			`# meta_template=dict(`
			`# round=[`
			`# dict(role='HUMAN', api_role='HUMAN'),`
			`# dict(role='BOT', api_role='BOT', generate=True),`
			`# ],`
			`# ),`
			`# query_per_second=16,`
			`# batch_size=1024,`
			`# temperature=0.001,`
			`# tokenizer_path='gpt-4o-2024-05-13',`
			`# verbose=True,`
			`# max_out_len=16384,`
			`# # max_seq_len=32768,`
			`# max_seq_len=49152,`
			`# )`

			`verifier_cfg = dict(abbr='MiniMax-Text-01',`
			`type=MiniMaxChatCompletionV2,`
			`path='MiniMax-Text-01',`
			`url='https://api.minimax.chat/v1/text/chatcompletion_v2',`
			key='eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJHcm91cE5hbWUiOiLlmLvlmLvlk4jlk4giLCJVc2VyTmFtZSI6IuWYu-WYu-WTiOWTiCIsIkFjY291bnQiOiIiLCJTdWJqZWN0SUQiOiIxNzQzNTAzNjg0MDUxOTMxMjk0IiwiUGhvbmUiOiIxODEwMDE3NjQ5OCIsIkdyb3VwSUQiOiIxNzQzNTAzNjg0MDQzNTQyNjg2IiwiUGFnZU5hbWUiOiIiLCJNYWlsIjoiIiwiQ3JlYXRlVGltZSI6IjIwMjUtMDMtMDcgMTY6MzU6MjQiLCJUb2tlblR5cGUiOjEsImlzcyI6Im1pbmltYXgifQ.p4QeQoT6Sk7hOCUxIfPpPJqFstT61DoKy7RGh5BcfhyLDO-l_0WFcbfyA212fCwxbCJc-RWCwM2H8q0nfMvKricNY9cXoFcQp2wCqcW11Rq6fhIpE8FzQz4HTDSlHEec9mwGDdeTOqOXUALhgqYho2anH2VP8aoARuXOSY8He_KyBxHRvODucarRkYWOjMUd20DRni7SGm8n_Gi2B_DacGW1ie60U8t2Aahna5h7pGFqudP0r-_YUtDabuqPX0Vo_EKPu1ZyVrY7jP_YT4FVx6AtYBrZcTzcq-KTm_F86-1sioUXzz9oPo3JFFJudhwbVQYrySB5jqJUJNoXP8OzDA',
			`# You need to set your own API key`
			`meta_template=dict(`
			`round=[`
			`dict(role='HUMAN', api_role='HUMAN'),`
			`dict(role='BOT', api_role='BOT', generate=True),`
			`],`
			`),`
			`query_per_second=3,`
			`retry=1,`
			`max_out_len=4096, max_seq_len=32768, batch_size=2)`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00
			`for item in datasets:`
			`# item['infer_cfg']['inferencer']['max_out_len'] = 32768 # You can unset this line if you want to avoid length cutoff`
			`if 'judge_cfg' in item['eval_cfg']['evaluator']:`
			`item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg`

			`#######################################################################`
			`# PART 2 Model List #`
			`#######################################################################`

			`models = sum([v for k, v in locals().items() if k.endswith('_model')], [])`

			`models += [`
			`# You can comment out the models you don't want to evaluate`
			`# All models use sampling mode`
			`dict(`
			`type=TurboMindModelwithChatTemplate,`
			`abbr='deepseek-r1-distill-qwen-7b-turbomind',`
			`path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',`
			`engine_config=dict(session_len=32768, max_batch_size=128, tp=1),`
			`gen_config=dict(`
测试 2025-03-07 18:00:48 +08:00			`do_sample=True,`
			`temperature=0.6,`
			`top_p=0.95,`
			`max_new_tokens=32768),`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00			`max_seq_len=32768,`
			`max_out_len=32768,`
			`batch_size=64,`
			`run_cfg=dict(num_gpus=1),`
			`pred_postprocessor=dict(type=extract_non_reasoning_content)`
			`),`
			`# dict(`
			`# type=TurboMindModelwithChatTemplate,`
			`# abbr='deepseek-r1-distill-qwen-14b-turbomind',`
			`# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',`
			`# engine_config=dict(session_len=32768, max_batch_size=128, tp=2),`
			`# gen_config=dict(`
			`# do_sample=True,`
			`# temperature=0.6,`
			`# top_p=0.95,`
			`# max_new_tokens=32768),`
			`# max_seq_len=32768,`
			`# max_out_len=32768,`
			`# batch_size=128,`
			`# run_cfg=dict(num_gpus=2),`
			`# pred_postprocessor=dict(type=extract_non_reasoning_content)`
			`# ),`
			`# dict(`
			`# type=TurboMindModelwithChatTemplate,`
			`# abbr='deepseek-r1-distill-qwen-32b-turbomind',`
			`# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',`
			`# engine_config=dict(session_len=32768, max_batch_size=128, tp=4),`
			`# gen_config=dict(`
			`# do_sample=True,`
			`# temperature=0.6,`
			`# top_p=0.95,`
			`# max_new_tokens=16384),`
			`# max_seq_len=32768,`
			`# max_out_len=16384,`
			`# batch_size=128,`
			`# run_cfg=dict(num_gpus=4),`
			`# pred_postprocessor=dict(type=extract_non_reasoning_content)`
			`# ),`
			`]`
测试 2025-03-07 18:00:48 +08:00			`models = minimax_ds_r1`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00
			`#######################################################################`
			`# PART 3 Inference/Evaluation #`
			`#######################################################################`

			`# Inference configuration`
			`infer = dict(`
			`partitioner=dict(`
			`type=NumWorkerPartitioner,`
			`num_worker=1`
			`# Similar with data-parallelism, how many workers for evaluation,`
			`# each worker will evaluate a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker`
			`# For example, If you have 8 GPUs, for 7B model using 1 GPU for one instance, you can set num_worker=8`
			`# to max-utilize the GPUs.`
			`# If you have 8 GPUs, for 14B model using 2 GPUs for one instance, you can set num_worker=4`
			`),`
			`runner=dict(`
			`type=LocalRunner,`
			`task=dict(type=OpenICLInferTask)`
			`),`
			`)`

			`# Evaluation configuration`
			`eval = dict(`
			`partitioner=dict(`
			`type=NaivePartitioner, n=8`
			`),`
			`runner=dict(`
			`type=LocalRunner,`
			`task=dict(`
			`type=OpenICLEvalTask)`
			`),`
			`)`

			`#######################################################################`
			`# PART 4 Summarizer #`
			`#######################################################################`


			`summary_groups = sum(`
			`[v for k, v in locals().items() if k.endswith('_summary_groups')], []`
			`)`

			`summary_groups.extend([`
			`{`
			`'name': 'AIME2024-Aveage8',`
测试 2025-03-07 18:00:48 +08:00			`'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00			`},`
			`{`
			`'name': 'LiveMathBench-v202412-Hard-Aveage8',`
测试 2025-03-07 18:00:48 +08:00			`'subsets': [[`
			`f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']`
			`for split, run_idx in product(['hard_cn', 'hard_en'], range(8))`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00			`]`
			`}`
			`])`

			`# Summarizer`
			`summarizer = dict(`
			`dataset_abbrs=[`
测试 2025-03-07 18:00:48 +08:00			`# 'MATH',`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00			`# ['LiveMathBench-k1-n1', 'pass@1'],`
			`# ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],`
测试 2025-03-07 18:00:48 +08:00			`['aime2024', 'accuracy'],`
			`# ['math_prm800k_500-llmjudge', 'accuracy'],`
			`# ['AIME2024-Aveage8', 'naive_average'],`
			`# ['LiveMathBench-v202412-Hard-Aveage8', 'naive_average'],`
			`# ['OlympiadBenchMath', 'accuracy'],`
			`# ['OmniMath', 'accuracy'],`
[Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example 2025-03-03 18:56:11 +08:00			`],`
			`summary_groups=summary_groups,`
			`)`

			`#######################################################################`
			`# PART 5 Utils #`
			`#######################################################################`

			`work_dir = 'outputs/deepseek_r1_reasoning'`