From 93ecc670dff183b6b6caf9bd827bc1ff62f5c6f6 Mon Sep 17 00:00:00 2001
From: zhangsongyang
Date: Wed, 30 Apr 2025 09:47:51 +0000
Subject: [PATCH] Update

---
 examples/eval_qwen3.py | 162 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 examples/eval_qwen3.py

diff --git a/examples/eval_qwen3.py b/examples/eval_qwen3.py
new file mode 100644
index 00000000..eb600515
--- /dev/null
+++ b/examples/eval_qwen3.py
@@ -0,0 +1,162 @@
+# Example OpenCompass config: evaluate Qwen3-235B-A22B on MATH-500,
+# AIME 2024 and AIME 2025 using the cascade-eval dataset configs. Both the
+# evaluated model and the judge model are accessed through OpenAISDK.
+import os.path as osp
+
+from mmengine.config import read_base
+
+from opencompass.models import OpenAISDK
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+with read_base():
+    from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
+    from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
+    from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets
+
+#######################################################################
+#                          PART 0  Meta Info                          #
+#######################################################################
+
+# Chat template shared by API models: one HUMAN turn followed by one BOT
+# turn, the BOT turn being the one that is generated.
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+)
+
+# Judge model configuration. PART 1 below copies it into every dataset
+# evaluator that already declares a `judge_cfg` entry.
+# NOTE(review): `key` and `openai_api_base` look like placeholders --
+# confirm they point at a real deployment before running.
+judge_cfg = dict(
+    abbr='qwen2-5-32B-Instruct',
+    type=OpenAISDK,
+    path='Qwen/Qwen2.5-32B-Instruct',
+    key='sk-1234',
+    openai_api_base=[
+        'http://x.x.x.x:4000/v1',
+    ],
+    meta_template=api_meta_template,
+    query_per_second=8,
+    batch_size=256,
+    temperature=0.001,
+    # max_completion_tokens=32768,
+    tokenizer_path='gpt-4o-2024-05-13',
+    # verbose=True,
+    max_out_len=16384,
+    max_seq_len=32768,
+    # max_seq_len=49152,
+    mode='mid',
+    retry=10,
+)
+
+#######################################################################
+#                        PART 1  Datasets List                        #
+#######################################################################
+
+# Pairs of (dataset list, value written to each dataset's `n` field).
+repeated_info = [
+    (math_datasets, 4),
+    (aime2024_datasets, 32),
+    (aime2025_datasets, 32),
+]
+
+for datasets_, num in repeated_info:
+    for dataset_ in datasets_:
+        dataset_['n'] = num
+
+# Concatenate every variable in scope whose name ends with `_datasets`.
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],
+)
+
+for item in datasets:
+    item['infer_cfg']['inferencer']['max_out_len'] = 32768
+    # The judge config may sit directly on the evaluator or on a nested
+    # `llm_evaluator`; only an already-declared `judge_cfg` is replaced.
+    try:
+        if 'judge_cfg' in item['eval_cfg']['evaluator']:
+            item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+        elif 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
+            item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
+    except (KeyError, TypeError):
+        # Best effort: a dataset whose evaluator has no judge slot is left
+        # untouched. Only lookup failures are ignored so other errors surface.
+        pass
+
+#######################################################################
+#                     PART 2  Dataset Summarizer                      #
+#######################################################################
+
+summarizer = dict(
+    dataset_abbrs=[
+        'MATH',
+        ['math_prm800k_500', 'accuracy (4 runs average)'],
+        ['aime2024', 'accuracy (32 runs average)'],
+        ['aime2025', 'accuracy (32 runs average)'],
+        ['livemathbench_hard', 'naive_average'],
+        ['OlympiadBenchMath', 'accuracy'],
+        ['olymmath', 'naive_average'],
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+    ),
+)
+
+#######################################################################
+#                         PART 3  Models List                         #
+#######################################################################
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+models += [
+    dict(
+        abbr='Qwen_Qwen3-235B-A22B',
+        type=OpenAISDK,
+        path='Qwen/Qwen3-235B-A22B',
+        # NOTE(review): endpoint address and API key are hard-coded; consider
+        # supplying them from the environment instead of committing them.
+        key='sk-admin',
+        openai_api_base=[
+            'http://106.15.231.215:40007/v1/',
+        ],
+        meta_template=dict(
+            # begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
+            round=[
+                dict(role='HUMAN', api_role='HUMAN'),
+                # XXX: all system roles are mapped to human on purpose
+                dict(role='BOT', api_role='BOT', generate=True),
+            ],
+        ),
+        query_per_second=16,
+        batch_size=128,
+        # batch_size=1,
+        temperature=0.6,
+        # max_completion_tokens=32768,
+        tokenizer_path='gpt-4',
+        # verbose=True,
+        max_out_len=32768,
+        max_seq_len=32768,
+        pred_postprocessor=dict(type=extract_non_reasoning_content),
+    ),
+]
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
+)
+
+# NOTE(review): `eval` shadows the builtin; presumably OpenCompass reads
+# this exact name, so it is kept -- confirm before renaming.
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=8),
+    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
+)
+
+base_exp_dir = 'outputs/qwen3_reasoning'
+work_dir = osp.join(base_exp_dir, 'chat_objective')