mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

* Update CascadeEvaluator * Update CascadeEvaluator * Update CascadeEvaluator * Update Config * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update
142 lines
4.7 KiB
Python
142 lines
4.7 KiB
Python
|
|
import os.path as osp
|
|
from opencompass.models import OpenAISDK
|
|
from mmengine.config import read_base
|
|
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
|
from opencompass.runners import LocalRunner
|
|
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
|
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
|
|
|
with read_base():
|
|
from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
|
|
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
|
|
from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets
|
|
|
|
#######################################################################
|
|
# PART 0 Meta Info #
|
|
#######################################################################
|
|
|
|
|
|
api_meta_template = dict(round=[
|
|
dict(role='HUMAN', api_role='HUMAN'),
|
|
dict(role='BOT', api_role='BOT', generate=True),
|
|
],
|
|
)
|
|
|
|
|
|
judge_cfg = dict(
|
|
abbr='qwen2-5-32B-Instruct',
|
|
type=OpenAISDK,
|
|
path='Qwen/Qwen2.5-32B-Instruct',
|
|
key='sk-1234',
|
|
openai_api_base=[
|
|
'http://x.x.x.x:4000/v1',
|
|
],
|
|
meta_template=api_meta_template,
|
|
query_per_second=8,
|
|
batch_size=256,
|
|
temperature=0.001,
|
|
# max_completion_tokens=32768,
|
|
tokenizer_path='gpt-4o-2024-05-13',
|
|
# verbose=True,
|
|
max_out_len=16384,
|
|
max_seq_len=32768,
|
|
# max_seq_len=49152,
|
|
mode='mid',
|
|
retry=10
|
|
)
|
|
|
|
#######################################################################
|
|
# PART 1 Datasets List #
|
|
#######################################################################
|
|
|
|
repeated_info = [
|
|
(math_datasets, 4),
|
|
(aime2024_datasets, 32),
|
|
(aime2025_datasets, 32),
|
|
]
|
|
|
|
for datasets_, num in repeated_info:
|
|
for dataset_ in datasets_:
|
|
dataset_['n'] = num
|
|
|
|
datasets = sum(
|
|
(v for k, v in locals().items() if k.endswith('_datasets')),
|
|
[],
|
|
)
|
|
|
|
for item in datasets:
|
|
item['infer_cfg']['inferencer']['max_out_len'] = 32768
|
|
try:
|
|
if 'judge_cfg' in item['eval_cfg']['evaluator']:
|
|
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
|
|
elif'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
|
|
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
|
|
except:
|
|
pass
|
|
#######################################################################
|
|
# PART 2 Dataset Summarizer #
|
|
#######################################################################
|
|
|
|
summarizer = dict(
|
|
dataset_abbrs=[
|
|
'MATH',
|
|
['math_prm800k_500', 'accuracy (4 runs average)'],
|
|
['aime2024', 'accuracy (32 runs average)'],
|
|
['aime2025', 'accuracy (32 runs average)'],
|
|
['livemathbench_hard', 'naive_average'],
|
|
['OlympiadBenchMath', 'accuracy'],
|
|
['olymmath', 'naive_average'],
|
|
],
|
|
summary_groups = sum(
|
|
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
|
|
),
|
|
)
|
|
|
|
#######################################################################
|
|
# PART 3 Models List #
|
|
#######################################################################
|
|
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
|
models += [
|
|
|
|
dict(
|
|
abbr='Qwen_Qwen3-235B-A22B',
|
|
type=OpenAISDK,
|
|
path='Qwen/Qwen3-235B-A22B',
|
|
key='sk-admin',
|
|
openai_api_base=[
|
|
'http://106.15.231.215:40007/v1/',
|
|
],
|
|
meta_template=dict(
|
|
# begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
|
|
round=[
|
|
dict(role='HUMAN', api_role='HUMAN'),
|
|
# XXX: all system roles are mapped to human in purpose
|
|
dict(role='BOT', api_role='BOT', generate=True),
|
|
]
|
|
),
|
|
query_per_second=16,
|
|
batch_size=128,
|
|
# batch_size=1,
|
|
temperature=0.6,
|
|
# max_completion_tokens=32768,
|
|
tokenizer_path='gpt-4',
|
|
# verbose=True,
|
|
max_out_len=32768,
|
|
max_seq_len=32768,
|
|
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
|
),
|
|
]
|
|
|
|
infer = dict(
|
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
|
|
)
|
|
|
|
eval = dict(
|
|
partitioner=dict(type=NaivePartitioner, n=8),
|
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
|
|
)
|
|
|
|
base_exp_dir = 'outputs/qwen3_reasoning'
|
|
work_dir = osp.join(base_exp_dir, 'chat_objective') |