# Mirror of https://github.com/open-compass/opencompass.git
# Synced 2025-05-30 16:03:24 +08:00 (183 lines, 6.6 KiB, Python).
from copy import deepcopy
|
|
|
|
from mmengine.config import read_base
|
|
|
|
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
|
from opencompass.runners import LocalRunner
|
|
from opencompass.summarizers import DefaultSubjectiveSummarizer
|
|
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
|
|
|
with read_base():
|
|
# read hf models - chat models
|
|
# Dataset
|
|
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
|
|
csimpleqa_datasets # noqa: F401, E501
|
|
from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
|
|
simpleqa_datasets # noqa: F401, E501; noqa: F401, E501
|
|
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
|
|
alignbench_datasets # noqa: F401, E501
|
|
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
|
|
alpacav2_datasets # noqa: F401, E501
|
|
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
|
|
arenahard_datasets # noqa: F401, E501
|
|
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
|
|
compassarena_datasets # noqa: F401, E501
|
|
# from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets # noqa: F401, E501
|
|
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
|
|
followbench_llmeval_datasets # noqa: F401, E501
|
|
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
|
|
mtbench101_datasets # noqa: F401, E501
|
|
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
|
|
wildbench_datasets # noqa: F401, E501
|
|
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
|
|
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
|
|
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
|
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
|
|
|
|
from ...volc import infer as volc_infer # noqa: F401, E501
|
|
|
|
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
|
|
and 'mtbench101' not in k and 'wildbench' not in k), [])
|
|
datasets += mtbench101_datasets # noqa: F401, E501
|
|
datasets += wildbench_datasets # noqa: F401, E501
|
|
|
|
api_meta_template = dict(
|
|
round=[
|
|
dict(role='HUMAN', api_role='HUMAN'),
|
|
dict(role='BOT', api_role='BOT', generate=True),
|
|
],
|
|
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
|
)
|
|
|
|
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
|
for m in models:
|
|
m['abbr'] = m['abbr'] + '_fullbench'
|
|
if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
|
|
m['engine_config']['max_batch_size'] = 1
|
|
m['batch_size'] = 1
|
|
|
|
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
|
|
|
|
judge_models = deepcopy([models[1]])
|
|
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'
|
|
|
|
eval = dict(
|
|
partitioner=dict(
|
|
type=SubjectiveNaivePartitioner,
|
|
models=models,
|
|
judge_models=judge_models,
|
|
),
|
|
runner=dict(type=LocalRunner,
|
|
max_num_workers=16,
|
|
task=dict(type=SubjectiveEvalTask)),
|
|
)
|
|
|
|
summary_groups = []
|
|
summary_groups.append({
|
|
'name': 'compassarena_language',
|
|
'subsets': [
|
|
['compassarena_language', '内容总结'],
|
|
],
|
|
})
|
|
summary_groups.append({
|
|
'name': 'compassarena_knowledge',
|
|
'subsets': [
|
|
['compassarena_knowledge', '生活常识_ZH'],
|
|
],
|
|
})
|
|
summary_groups.append({
|
|
'name': 'compassarena_reason_v2',
|
|
'subsets': [
|
|
['compassarena_reason_v2', 'reasoning'],
|
|
],
|
|
})
|
|
summary_groups.append({
|
|
'name': 'compassarena_math_v2',
|
|
'subsets': [
|
|
['compassarena_math_v2', '高等数学_ZH'],
|
|
],
|
|
})
|
|
summary_groups.append({
|
|
'name': 'compassarena_creationv2_zh',
|
|
'subsets': [
|
|
['compassarena_creationv2_zh', '内容扩写_ZH'],
|
|
],
|
|
})
|
|
summary_groups.append({
|
|
'name':
|
|
'CompassArena',
|
|
'subsets': [
|
|
'compassarena_language',
|
|
'compassarena_knowledge',
|
|
'compassarena_reason_v2',
|
|
'compassarena_math_v2',
|
|
'compassarena_creationv2_zh',
|
|
],
|
|
})
|
|
summary_groups.append({
|
|
'name':
|
|
'FoFo',
|
|
'subsets': [['fofo_test_prompts', 'overall'],
|
|
['fofo_test_prompts_cn', 'overall']],
|
|
})
|
|
summary_groups.append({
|
|
'name':
|
|
'Followbench',
|
|
'subsets': [
|
|
['followbench_llmeval_en', 'HSR_AVG'],
|
|
['followbench_llmeval_en', 'SSR_AVG'],
|
|
],
|
|
})
|
|
|
|
# Summarizer
|
|
summarizer = dict(
|
|
dataset_abbrs=[
|
|
['alignment_bench_v1_1', '总分'],
|
|
['alpaca_eval', 'total'],
|
|
['arenahard', 'score'],
|
|
['Followbench', 'naive_average'],
|
|
['CompassArena', 'naive_average'],
|
|
['FoFo', 'naive_average'],
|
|
['mtbench101', 'avg'],
|
|
['wildbench', 'average'],
|
|
['simpleqa', 'accuracy_given_attempted'],
|
|
['chinese_simpleqa', 'given_attempted_accuracy'],
|
|
'',
|
|
['alignment_bench_v1_1', '专业能力'],
|
|
['alignment_bench_v1_1', '数学计算'],
|
|
['alignment_bench_v1_1', '基本任务'],
|
|
['alignment_bench_v1_1', '逻辑推理'],
|
|
['alignment_bench_v1_1', '中文理解'],
|
|
['alignment_bench_v1_1', '文本写作'],
|
|
['alignment_bench_v1_1', '角色扮演'],
|
|
['alignment_bench_v1_1', '综合问答'],
|
|
['alpaca_eval', 'helpful_base'],
|
|
['alpaca_eval', 'koala'],
|
|
['alpaca_eval', 'oasst'],
|
|
['alpaca_eval', 'selfinstruct'],
|
|
['alpaca_eval', 'vicuna'],
|
|
['compassarena_language', 'naive_average'],
|
|
['compassarena_knowledge', 'naive_average'],
|
|
['compassarena_reason_v2', 'naive_average'],
|
|
['compassarena_math_v2', 'naive_average'],
|
|
['compassarena_creationv2_zh', 'naive_average'],
|
|
['fofo_test_prompts', 'overall'],
|
|
['fofo_test_prompts_cn', 'overall'],
|
|
['followbench_llmeval_en', 'HSR_AVG'],
|
|
['followbench_llmeval_en', 'SSR_AVG'],
|
|
['followbench_llmeval_en', 'HSR_L1'],
|
|
['followbench_llmeval_en', 'HSR_L2'],
|
|
['followbench_llmeval_en', 'HSR_L3'],
|
|
['followbench_llmeval_en', 'HSR_L4'],
|
|
['followbench_llmeval_en', 'HSR_L5'],
|
|
['followbench_llmeval_en', 'SSR_L1'],
|
|
['followbench_llmeval_en', 'SSR_L2'],
|
|
['followbench_llmeval_en', 'SSR_L3'],
|
|
['followbench_llmeval_en', 'SSR_L4'],
|
|
['followbench_llmeval_en', 'SSR_L5'],
|
|
['simpleqa', 'f1'],
|
|
],
|
|
type=DefaultSubjectiveSummarizer,
|
|
summary_groups=summary_groups,
|
|
)
|