from mmengine.config import read_base

from opencompass.summarizers.subjective.husimpleqa import HuSimpleQASummarizer

with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuSimpleQA.HuSimpleQA import HuSimpleQA_datasets, PROMPT_LANGUAGES

    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
    from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_2024_11_20_model
    from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model

    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import models as lmdeploy_qwen2_5_32b_instruct_model
    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import models as lmdeploy_qwen2_5_14b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import models as lmdeploy_llama3_1_70b_instruct_model
    # from opencompass.configs.models.hf_llama.llama3_3_70b_api_siliconflow import models as llama3_3_70b_api_siliconflow_model

    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

    from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
    from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model
    # from opencompass.configs.models.deepseek.deepseek_r1_distill_llama_8b_api_aliyun import models as deepseek_r1_distill_llama_8b_api_aliyun_model
    # from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b_instruct import models as deepseek_r1_distill_qwen_32b_instruct_model
    # from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_14b_instruct import models as deepseek_r1_distill_qwen_14b_instruct_model
    # from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b_instruct import models as deepseek_r1_distill_llama_70b_instruct_model

    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
    # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model

from opencompass.models import OpenAI
from opencompass.partitioners import (
    NumWorkerPartitioner,
    SubjectiveNumWorkerPartitioner,
)
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
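
# Meta template for API-based chat models: maps the dataset's HUMAN/BOT roles
# onto the corresponding API roles, with the BOT turn being generated.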
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])
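
# Gather every imported `*_model` list into a single list of candidate models.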
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    # DeepSeek-R1 API models return their reasoning content; strip the
    # <think> block before evaluation.
    if model['abbr'].startswith('deepseek_r1_api_'):
        model['return_reasoning_content'] = True
        model['pred_postprocessor'] = {
            'OpenHuEval_*': {
                'type': 'rm_<think>_before_eval'
            }
        }
    # QwQ outputs are post-processed to extract the final answer before
    # HuSimpleQA judging.
    if model['abbr'].startswith('QwQ'):
        model['pred_postprocessor'] = {
            'OpenHuEval_*': {
                'type': 'extract_qwq_answer_before_eval_for_husimpleqa'
            }
        }
del model
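
# LLM judge for subjective evaluation; the API key and proxy URL are expected
# to be supplied via environment variables ('ENV').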
judge_models = [
    dict(
        abbr='GPT-4o-2024-08-06',
        type=OpenAI,
        path='gpt-4o-2024-08-06',
        key='ENV',
        openai_proxy_url='ENV',
        verbose=True,
        meta_template=api_meta_template,
        query_per_second=2,
        max_out_len=8192,
        max_seq_len=16384,
        batch_size=8,
        temperature=0,
    )
]

datasets = HuSimpleQA_datasets
del HuSimpleQA_datasets
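
# Inference stage: each dataset is split across 8 workers and the inference
# tasks are launched through Slurm.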
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8,
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)
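
# Evaluation stage: predictions are scored by the judge model; the subjective
# evaluation tasks run locally.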
eval = dict(
    partitioner=dict(
        type=SubjectiveNumWorkerPartitioner,
        num_worker=8,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)
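
# Summarize HuSimpleQA results for each prompt language defined in the
# dataset config.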
summarizer = dict(type=HuSimpleQASummarizer,
                  prompt_languages=PROMPT_LANGUAGES)

work_dir = (
    './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'
)  # do NOT modify this line, yapf: disable, pylint: disable