[ci] add fullbench testcase (#1766)

add volc testcase
zhulinJulia24 2024-12-18 13:24:28 +08:00 committed by GitHub
parent 38dba9919b
commit 111f817e04
9 changed files with 1359 additions and 622 deletions


@@ -99,61 +99,66 @@ GaokaoBench_datasets = [
]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
{
'name': 'Mathbench',
'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}, )
summarizer = dict(
dataset_abbrs=[
'Language',
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
'',
'General Reasoning',
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['winogrande', 'accuracy'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
'GaokaoBench_2010-2022_Math_II_MCQs',
'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
['math', 'accuracy'],
['Mathbench', 'naive_average'],
'',
'Knowledge',
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['openai_humaneval_v2', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
'',
['dingo_en_192', 'score'],
['dingo_zh_170', 'score'],
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
['mmlu-other', 'accuracy'],
'',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
['cmmlu-china-specific', 'accuracy'],
'',
'mmlu_pro',
'mmlu_pro_biology',
'mmlu_pro_business',
@@ -169,9 +174,24 @@ summarizer = dict(
'mmlu_pro_physics',
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
],
summary_groups=summary_groups,
)
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
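The locals()-scan above is the aggregation idiom these OpenCompass configs rely on: every variable that the imports under read_base() drop into module scope and whose name ends with _datasets, _summary_groups or _model is concatenated into one flat list. A minimal, self-contained sketch of the same pattern; the placeholder entries below are invented for illustration and are not real OpenCompass configs.

# Minimal sketch of the locals()-scan aggregation idiom (placeholder data).
gsm8k_datasets = [{'abbr': 'gsm8k'}]
math_datasets = [{'abbr': 'math'}]

# At module level, locals() is the module namespace, so this picks up every
# variable defined (or imported) above whose name ends with '_datasets'.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
assert [d['abbr'] for d in datasets] == ['gsm8k', 'math']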


@@ -7,8 +7,14 @@ with read_base():
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
# remove because of oom
# from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \
bigcodebench_hard_complete_datasets # noqa: F401, E501
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \
bigcodebench_hard_instruct_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
@@ -26,15 +32,17 @@ with read_base():
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.humanevalx.humanevalx_gen_3d84a3 import \
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
korbench_0shot_single_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
LCB_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets # noqa: F401, E501
@@ -71,6 +79,7 @@ with read_base():
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
@@ -81,6 +90,8 @@ with read_base():
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.humanevalx import \
humanevalx_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.korbench import \
korbench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
@@ -185,6 +196,8 @@ summarizer = dict(
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['musr_average', 'naive_average'],
['korbench_single', 'naive_average'],
['ARC_Prize_Public_Evaluation', 'accuracy'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
@@ -208,6 +221,8 @@ summarizer = dict(
['lcb_code_generation', 'pass@1'],
['lcb_code_execution', 'pass@1'],
['lcb_test_output', 'pass@1'],
['bigcodebench_hard_instruct', 'pass@1'],
['bigcodebench_hard_complete', 'pass@1'],
'',
'Agent',
['teval', 'naive_average'],


@@ -4,35 +4,37 @@ from mmengine.config import read_base
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
csimpleqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
simpleqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
alignbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
alpacav2_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
arenahard_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
compassarena_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import \
fofo_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
followbench_llmeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
mtbench101_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
wildbench_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets # noqa: F401, E501
@@ -68,3 +70,128 @@ eval = dict(
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summary_groups = []
summary_groups.append({
'name':
'compassarena_language',
'subsets': [
['compassarena_language', '内容总结'],
['compassarena_language', '情感分析'],
['compassarena_language', 'Information Retrival'],
['compassarena_language', '综合问答'],
['compassarena_language', '中华文化'],
],
})
summary_groups.append({
'name':
'compassarena_knowledge',
'subsets': [
['compassarena_knowledge', '生活常识_ZH'],
['compassarena_knowledge', '自然科学工科_ZH'],
['compassarena_knowledge', '人文科学_ZH'],
['compassarena_knowledge', '自然科学理科_ZH'],
['compassarena_knowledge', '社会科学_ZH'],
],
})
summary_groups.append({
'name': 'compassarena_reason_v2',
'subsets': [
['compassarena_reason_v2', 'reasoning'],
],
})
summary_groups.append({
'name':
'compassarena_math_v2',
'subsets': [
['compassarena_math_v2', '高等数学_ZH'],
['compassarena_math_v2', '初等数学_ZH'],
['compassarena_math_v2', '中等数学_ZH'],
],
})
summary_groups.append({
'name':
'compassarena_creationv2_zh',
'subsets': [
['compassarena_creationv2_zh', '内容扩写_ZH'],
['compassarena_creationv2_zh', '内容续写_ZH'],
['compassarena_creationv2_zh', '内容改写_ZH'],
],
})
summary_groups.append({
'name':
'CompassArena',
'subsets': [
'compassarena_language',
'compassarena_knowledge',
'compassarena_reason_v2',
'compassarena_math_v2',
'compassarena_creationv2_zh',
],
})
summary_groups.append({
'name':
'FoFo',
'subsets': [['fofo_test_prompts', 'overall'],
['fofo_test_prompts_cn', 'overall']],
})
summary_groups.append({
'name':
'Followbench',
'subsets': [
['followbench_llmeval_en', 'HSR_AVG'],
['followbench_llmeval_en', 'SSR_AVG'],
],
})
# Summarizer
summarizer = dict(
dataset_abbrs=[
['alignment_bench_v1_1', '总分'],
['alpaca_eval', 'total'],
['arenahard', 'score'],
['Followbench', 'naive_average'],
['CompassArena', 'naive_average'],
['FoFo', 'naive_average'],
['mtbench101', 'avg'],
['wildbench', 'average'],
['simpleqa', 'accuracy_given_attempted'],
['chinese_simpleqa', 'given_attempted_accuracy'],
'',
['alignment_bench_v1_1', '专业能力'],
['alignment_bench_v1_1', '数学计算'],
['alignment_bench_v1_1', '基本任务'],
['alignment_bench_v1_1', '逻辑推理'],
['alignment_bench_v1_1', '中文理解'],
['alignment_bench_v1_1', '文本写作'],
['alignment_bench_v1_1', '角色扮演'],
['alignment_bench_v1_1', '综合问答'],
['alpaca_eval', 'helpful_base'],
['alpaca_eval', 'koala'],
['alpaca_eval', 'oasst'],
['alpaca_eval', 'selfinstruct'],
['alpaca_eval', 'vicuna'],
['compassarena_language', 'naive_average'],
['compassarena_knowledge', 'naive_average'],
['compassarena_reason_v2', 'naive_average'],
['compassarena_math_v2', 'naive_average'],
['compassarena_creationv2_zh', 'naive_average'],
['fofo_test_prompts', 'overall'],
['fofo_test_prompts_cn', 'overall'],
['followbench_llmeval_en', 'HSR_AVG'],
['followbench_llmeval_en', 'SSR_AVG'],
['followbench_llmeval_en', 'HSR_L1'],
['followbench_llmeval_en', 'HSR_L2'],
['followbench_llmeval_en', 'HSR_L3'],
['followbench_llmeval_en', 'HSR_L4'],
['followbench_llmeval_en', 'HSR_L5'],
['followbench_llmeval_en', 'SSR_L1'],
['followbench_llmeval_en', 'SSR_L2'],
['followbench_llmeval_en', 'SSR_L3'],
['followbench_llmeval_en', 'SSR_L4'],
['followbench_llmeval_en', 'SSR_L5'],
['simpleqa', 'f1'],
],
type=DefaultSubjectiveSummarizer,
summary_groups=summary_groups,
)
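Each summary_groups entry above names an aggregate and lists the subset scores it rolls up; DefaultSubjectiveSummarizer performs the aggregation when the summary table is written. Purely as an illustration of what a group like FoFo implies (the summarizer's real internals are not shown in this diff, and the scores mapping below is hypothetical), the naive average works out as:

# Illustration only: the naive average implied by the FoFo summary group.
fofo_group = {
    'name': 'FoFo',
    'subsets': [['fofo_test_prompts', 'overall'],
                ['fofo_test_prompts_cn', 'overall']],
}

def naive_average(group, scores):
    # 'scores' maps (dataset, metric) pairs to values; hypothetical layout.
    values = [scores[tuple(subset)] for subset in group['subsets']]
    return sum(values) / len(values)

scores = {('fofo_test_prompts', 'overall'): 1.0,
          ('fofo_test_prompts_cn', 'overall'): 0.8}
print(naive_average(fofo_group, scores))  # 0.9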


@@ -7,28 +7,55 @@ import yaml
output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf',
'glm-4-9b-chat-hf',
'glm-4-9b-chat-turbomind',
'glm-4-9b-chat-vllm',
'deepseek-7b-chat-hf',
'deepseek-moe-16b-chat-hf',
'deepseek-7b-chat-vllm',
'gemma2-2b-it-hf',
'gemma2-9b-it-hf',
'gemma-2b-it-hf',
'gemma-7b-it-hf',
'gemma-2-9b-it-turbomind',
'gemma-7b-it-vllm',
'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind',
'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind',
'internlm2-chat-7b-lmdeploy',
'internlm2-chat-7b-sft-turbomind',
'internlm2-chat-7b-vllm',
'llama-3_1-8b-instruct-hf',
'llama-3_2-3b-instruct-hf',
'llama-3-8b-instruct-hf',
'llama-3_1-8b-instruct-turbomind',
'llama-3_2-3b-instruct-turbomind',
'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf',
'mistral-7b-instruct-v0.3-hf',
'mistral-nemo-instruct-2407-hf',
'mistral-nemo-instruct-2407-turbomind',
'mistral-7b-instruct-v0.1-vllm',
'mistral-7b-instruct-v0.2-vllm',
# 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
# 'minicpm-2b-sft-fp32-hf',
'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf',
'qwen2-1.5b-instruct-hf',
'qwen2-7b-instruct-hf',
'qwen2-1.5b-instruct-turbomind',
'qwen2-7b-instruct-turbomind',
'qwen1.5-0.5b-chat-vllm',
'yi-1.5-6b-chat-hf',
'yi-1.5-9b-chat-hf',
'deepseek-v2-lite-chat-hf',
'internlm2_5-20b-chat-hf',
'internlm2_5-20b-chat-turbomind',
'mistral-small-instruct-2409-hf',
'mistral-small-instruct-2409-turbomind',
'qwen2.5-14b-instruct-hf',
'qwen2.5-14b-instruct-turbomind'
]
base_model_list = [
@@ -92,9 +119,9 @@ def result_scores():
class TestChat:
"""Test cases for chat model."""
@pytest.mark.parametrize(
'model, dataset', [(p1, p2) for p1 in chat_model_list
for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
base_score = baseline_scores_testrange.get(model).get(dataset)
@@ -108,13 +135,14 @@ class TestChat:
class TestBase:
"""Test cases for base model."""
@pytest.mark.parametrize('model, dataset', [
(p1, p2) for p1 in base_model_list for p2 in
['gsm8k_accuracy', 'GPQA_diamond', 'race-high_accuracy', 'winogrande']
])
def test_model_dataset_score(self, baseline_scores_testrange,
result_scores, model, dataset):
if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
] and dataset != 'gsm8k_accuracy':
return
base_score = baseline_scores_testrange.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@@ -131,16 +159,23 @@ class TestChatObjFullbench:
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
'triviaqa_wiki_1shot_score', 'nq_open_1shot_score',
'IFEval_Prompt-level-strict-accuracy', 'drop_accuracy',
'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score',
'musr_average_naive_average', 'korbench_single_naive_average',
'gsm8k_accuracy', 'math_accuracy', 'cmo_fib_accuracy',
'aime2024_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4',
'sanitized_mbpp_score', 'ds1000_naive_average',
'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1',
'lcb_test_output_pass@1', 'bbh-logical_deduction_seven_objects_score',
'bbh-multistep_arithmetic_two_score', 'mmlu-other_naive_average',
'cmmlu-china-specific_naive_average', 'mmlu_pro_math_accuracy',
'ds1000_Pandas_accuracy', 'ds1000_Numpy_accuracy',
'ds1000_Tensorflow_accuracy', 'ds1000_Scipy_accuracy',
'ds1000_Sklearn_accuracy', 'ds1000_Pytorch_accuracy',
'ds1000_Matplotlib_accuracy', 'openai_mmmlu_lite_AR-XY_accuracy',
'college_naive_average', 'college_knowledge_naive_average'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
@@ -159,17 +194,27 @@ class TestChatSubFullbench:
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score',
'Followbench_naive_average', 'CompassArena_naive_average',
'mtbench101_avg', 'wildbench_average',
'simpleqa_accuracy_given_attempted',
'chinese_simpleqa_given_attempted_accuracy',
'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算',
'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理',
'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作',
'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答',
'alpaca_eval_helpful_base', 'compassarena_language_naive_average',
'compassarena_knowledge_naive_average',
'compassarena_reason_v2_naive_average',
'compassarena_math_v2_naive_average',
'compassarena_creationv2_zh_naive_average',
'fofo_test_prompts_overall', 'followbench_llmeval_en_HSR_AVG',
'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1',
'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3',
'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5',
'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2',
'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4',
'followbench_llmeval_en_SSR_L5', 'simpleqa_f1'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
@@ -187,13 +232,18 @@ class TestBaseFullbench:
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
] for p2 in [
'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy',
'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score',
'winogrande_accuracy', 'gsm8k_accuracy',
'GaokaoBench_2010-2022_Math_II_MCQs_score',
'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score',
'math_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4',
'sanitized_mbpp_score', 'dingo_en_192_score', 'dingo_zh_170_score',
'mmlu-other_accuracy', 'cmmlu-china-specific_accuracy',
'mmlu_pro_math_accuracy', 'bbh-logical_deduction_seven_objects_score',
'bbh-multistep_arithmetic_two_score', 'college_naive_average',
'college_knowledge_naive_average'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
@@ -209,40 +259,238 @@ class TestApibench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset',
[('lmdeploy-api-test', 'race-middle_accuracy'),
('lmdeploy-api-test', 'race-high_accuracy'),
('lmdeploy-api-test', 'gsm8k_accuracy')])
def test_api(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
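assert_score is defined elsewhere in this test module and is not part of the diff; it compares a freshly produced score against the recorded baseline. A minimal sketch of the comparison these tests rely on, assuming a hypothetical relative tolerance; the real helper's thresholds and reporting may differ.

# Hypothetical stand-in for the assert_score helper used above; the real
# implementation (thresholds, logging) lives elsewhere in the test module.
def assert_score(name, result_score, baseline_score, rel_tol=0.05):
    assert result_score is not None, f'{name}: missing result score'
    assert baseline_score is not None, f'{name}: missing baseline score'
    lower = float(baseline_score) * (1 - rel_tol)
    upper = float(baseline_score) * (1 + rel_tol)
    assert lower <= float(result_score) <= upper, (
        f'{name}: {result_score} outside [{lower:.2f}, {upper:.2f}]')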
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores_fullbench')
@pytest.mark.volc_fullbench
class TestVolcFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize('model, dataset', [(
p1, p2
) for p1 in ['internlm2_5-7b-chat-turbomind'] for p2 in [
'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
'triviaqa_wiki_1shot_score', 'nq_open_1shot_score',
'mmmlu_lite_naive_average', 'IFEval_Prompt-level-strict-accuracy',
'drop_accuracy', 'bbh_naive_average', 'GPQA_diamond_accuracy',
'hellaswag_accuracy', 'TheoremQA_score', 'musr_average_naive_average',
'korbench_single_naive_average',
'ARC_Prize_Public_Evaluation_accuracy', 'gsm8k_accuracy',
'GaokaoBench_weighted_average', 'math_accuracy', 'cmo_fib_accuracy',
'aime2024_accuracy', 'Mathbench_naive_average',
'wikibench-wiki-single_choice_cncircular_perf_4',
'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average',
'openai_humaneval_humaneval_pass@1', 'sanitized_mbpp_score',
'humanevalx_naive_average', 'ds1000_naive_average',
'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1',
'lcb_test_output_pass@1', 'bigcodebench_hard_instruct_pass@1',
'bigcodebench_hard_complete_pass@1', 'teval_naive_average',
'qa_dingo_cn_score', 'mmlu-stem_naive_average',
'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average',
'mmlu-other_naive_average', 'cmmlu-stem_naive_average',
'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average',
'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average',
'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy',
'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy',
'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy',
'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy',
'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy',
'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy',
'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy',
'humanevalx-python_pass@1', 'humanevalx-cpp_pass@1',
'humanevalx-go_pass@1', 'humanevalx-java_pass@1',
'humanevalx-js_pass@1', 'ds1000_Pandas_accuracy',
'ds1000_Numpy_accuracy', 'ds1000_Tensorflow_accuracy',
'ds1000_Scipy_accuracy', 'ds1000_Sklearn_accuracy',
'ds1000_Pytorch_accuracy', 'ds1000_Matplotlib_accuracy',
'openai_mmmlu_lite_AR-XY_accuracy', 'openai_mmmlu_lite_BN-BD_accuracy',
'openai_mmmlu_lite_DE-DE_accuracy', 'openai_mmmlu_lite_ES-LA_accuracy',
'openai_mmmlu_lite_FR-FR_accuracy', 'openai_mmmlu_lite_HI-IN_accuracy',
'openai_mmmlu_lite_ID-ID_accuracy', 'openai_mmmlu_lite_IT-IT_accuracy',
'openai_mmmlu_lite_JA-JP_accuracy', 'openai_mmmlu_lite_KO-KR_accuracy',
'openai_mmmlu_lite_PT-BR_accuracy', 'openai_mmmlu_lite_SW-KE_accuracy',
'openai_mmmlu_lite_YO-NG_accuracy', 'openai_mmmlu_lite_ZH-CN_accuracy',
'college_naive_average', 'high_naive_average', 'middle_naive_average',
'primary_naive_average', 'arithmetic_naive_average',
'mathbench-a (average)_naive_average',
'college_knowledge_naive_average', 'high_knowledge_naive_average',
'middle_knowledge_naive_average', 'primary_knowledge_naive_average',
'mathbench-t (average)_naive_average'
]])
@pytest.mark.chat_objective
def test_chat_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
for p2 in [
'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score',
'Followbench_naive_average', 'CompassArena_naive_average',
'FoFo_naive_average', 'mtbench101_avg', 'wildbench_average',
'simpleqa_accuracy_given_attempted',
'chinese_simpleqa_given_attempted_accuracy',
'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算',
'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理',
'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作',
'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答',
'alpaca_eval_helpful_base', 'alpaca_eval_koala',
'alpaca_eval_oasst', 'alpaca_eval_selfinstruct',
'alpaca_eval_vicuna', 'compassarena_language_naive_average',
'compassarena_knowledge_naive_average',
'compassarena_reason_v2_naive_average',
'compassarena_math_v2_naive_average',
'compassarena_creationv2_zh_naive_average',
'fofo_test_prompts_overall', 'fofo_test_prompts_cn_overall',
'followbench_llmeval_en_HSR_AVG',
'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1',
'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3',
'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5',
'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2',
'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4',
'followbench_llmeval_en_SSR_L5', 'simpleqa_f1'
]])
@pytest.mark.chat_subjective
def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.parametrize('model, dataset', [(
p1, p2
) for p1 in ['internlm2_5-7b-turbomind'] for p2 in [
'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy',
'bbh_naive_average', 'GPQA_diamond_accuracy', 'hellaswag_accuracy',
'TheoremQA_score', 'winogrande_accuracy', 'gsm8k_accuracy',
'GaokaoBench_weighted_average', 'math_accuracy',
'Mathbench_naive_average',
'wikibench-wiki-single_choice_cncircular_perf_4',
'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average',
'openai_humaneval_humaneval_pass@1',
'openai_humaneval_v2_humaneval_pass@1', 'sanitized_mbpp_score',
'dingo_en_192_score', 'dingo_zh_170_score', 'mmlu-stem_naive_average',
'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average',
'mmlu-other_naive_average', 'cmmlu-stem_naive_average',
'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average',
'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average',
'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy',
'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy',
'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy',
'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy',
'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy',
'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy',
'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy',
'college_naive_average', 'high_naive_average', 'middle_naive_average',
'primary_naive_average', 'arithmetic_naive_average',
'mathbench-a (average)_naive_average',
'college_knowledge_naive_average', 'high_knowledge_naive_average',
'middle_knowledge_naive_average', 'primary_knowledge_naive_average',
'mathbench-t (average)_naive_average'
]])
@pytest.mark.base_objective
def test_base_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
for p2 in [
'Single-Needle-Retrieval(S-RT)-32000_naive_average',
'Single-Needle-Retrieval-EN-32000_naive_average',
'Single-Needle-Retrieval-ZH-32000_naive_average',
'Single-Needle-Retrieval(S-RT)-100000_naive_average',
'Single-Needle-Retrieval-EN-100000_naive_average',
'Single-Needle-Retrieval-ZH-100000_naive_average',
'Single-Needle-Retrieval(S-RT)-200000_naive_average',
'Single-Needle-Retrieval-EN-200000_naive_average',
'Single-Needle-Retrieval-ZH-200000_naive_average',
'longbench_naive_average', 'longbench_zh_naive_average',
'longbench_en_naive_average',
'longbench_single-document-qa_naive_average',
'longbench_multi-document-qa_naive_average',
'longbench_summarization_naive_average',
'longbench_few-shot-learning_naive_average',
'longbench_synthetic-tasks_naive_average',
'longbench_code-completion_naive_average'
]])
@pytest.mark.base_long_context
def test_base_long_context(self, baseline_scores_fullbench, result_scores,
model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-1m-turbomind']
for p2 in [
'ruler_8k_naive_average', 'ruler_32k_naive_average',
'ruler_128k_naive_average',
'NeedleBench-Overall-Score-8K_weighted_average',
'NeedleBench-Overall-Score-32K_weighted_average',
'NeedleBench-Overall-Score-128K_weighted_average',
'longbench_naive_average', 'longbench_zh_naive_average',
'longbench_en_naive_average', 'babilong_0k_naive_average',
'babilong_4k_naive_average', 'babilong_16k_naive_average',
'babilong_32k_naive_average', 'babilong_128k_naive_average',
'babilong_256k_naive_average',
'longbench_single-document-qa_naive_average',
'longbench_multi-document-qa_naive_average',
'longbench_summarization_naive_average',
'longbench_few-shot-learning_naive_average',
'longbench_synthetic-tasks_naive_average',
'longbench_code-completion_naive_average'
]])
@pytest.mark.chat_long_context
def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
model, dataset):
base_score = baseline_scores_fullbench.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score)
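The volc_fullbench, chat_objective, chat_subjective, base_objective and *_long_context marks used by TestVolcFullbench are custom pytest markers, so they need to be registered (for example in conftest.py or pytest.ini) before marker-based selection runs cleanly. One possible registration, assuming conftest-based setup rather than anything shown in this diff:

# conftest.py sketch (assumption): register the custom marks used above so
# marker-filtered runs do not emit unknown-marker warnings.
def pytest_configure(config):
    for mark in ('volc_fullbench', 'chat_objective', 'chat_subjective',
                 'base_objective', 'base_long_context', 'chat_long_context'):
        config.addinivalue_line('markers', f'{mark}: volc fullbench subset')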
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestCmdCase:
@pytest.mark.case1
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
('internlm2_5-7b-hf', 'race-high_accuracy'),
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
('internlm2-1.8b-hf', 'race-middle_accuracy'),
('internlm2-1.8b-hf', 'race-high_accuracy'),
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.case2
@pytest.mark.parametrize(
'model, dataset',
[('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@@ -250,19 +498,19 @@ class TestCmdCase:
@pytest.mark.case3
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b_hf', 'race-middle_accuracy'),
('internlm2_5-7b_hf', 'race-high_accuracy'),
('internlm2_5-7b_hf', 'demo_gsm8k_accuracy')])
def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score)
@pytest.mark.case4
@pytest.mark.parametrize(
'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@@ -310,8 +558,7 @@ def find_csv_files(directory):
csv_files = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.csv') and file.startswith('summary'):
csv_files.append(os.path.join(root, file))
csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@@ -324,24 +571,15 @@ def read_csv_file(file_path):
with open(file_path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
filtered_data = []
for row in reader:
if row['metric'] is not None and 'bpb' not in row[
'metric'] and '_' != row['metric']:
filtered_row = row
filtered_row['dataset'] = row['dataset'] + '_' + row['metric']
del filtered_row['version']
del filtered_row['metric']
del filtered_row['mode']
filtered_data.append(filtered_row)
result = {}
for data in filtered_data:
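After this filtering, every summary-CSV row is keyed by dataset + '_' + metric, which is exactly the naming used in the baseline YAML files below (for example gsm8k_accuracy or race-high_accuracy). A small self-contained sketch of the resulting {model: {dataset_metric: score}} mapping, using made-up CSV content; only the column names mirror the code above.

import csv
import io

# Made-up summary CSV in the same general shape; values are illustrative.
raw = io.StringIO(
    'dataset,version,metric,mode,internlm2_5-7b-hf\n'
    'race-high,abc123,accuracy,ppl,90.02\n'
    'demo_gsm8k,def456,accuracy,gen,42.19\n')

scores = {}
for row in csv.DictReader(raw):
    key = row['dataset'] + '_' + row['metric']
    for model in ('internlm2_5-7b-hf',):
        scores.setdefault(model, {})[key] = float(row[model])

print(scores['internlm2_5-7b-hf']['race-high_accuracy'])  # 90.02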


@@ -1,34 +1,34 @@
internlm2_5-7b-hf:
demo_gsm8k_accuracy: 42.19
race-middle_accuracy: 91.78
race-high_accuracy: 90.02
internlm2_5-7b_hf:
demo_gsm8k_accuracy: 42.19
race-middle_accuracy: 91.78
race-high_accuracy: 90.02
internlm2-1.8b-hf:
demo_gsm8k_accuracy: 15.62
race-middle_accuracy: 71.66
race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 84.38
race-middle_accuracy: 92.76
race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k_accuracy: 31
race-middle_accuracy: 81.34
race-high_accuracy: 73.96
internlm2_5-7b-chat_hf:
demo_gsm8k_accuracy: 87.50
race-middle_accuracy: 92.76
race-high_accuracy: 90.48
lmdeploy-api-test:
gsm8k_accuracy: 83.78
race-middle_accuracy: 92.41
race-high_accuracy: 90.37
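The baseline_scores fixture presumably loads a YAML file shaped like the one above into a nested dict, so each test can call baseline_scores.get(model).get(dataset). A minimal sketch under that assumption; the inline sample here is illustrative, not the conftest's actual loading code.

import yaml

# Assumption: baselines are parsed into {model: {dataset_metric: score}}.
sample = """
internlm2_5-7b-hf:
  race-high_accuracy: 90.02
  demo_gsm8k_accuracy: 42.19
"""
baseline_scores = yaml.safe_load(sample)
assert baseline_scores['internlm2_5-7b-hf']['race-high_accuracy'] == 90.02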


@@ -1,173 +1,447 @@
internlm2_5-7b-chat-hf_fullbench:
race-high_accuracy: 93.75
ARC-c_accuracy: 93.75
BoolQ_accuracy: 81.25
triviaqa_wiki_1shot_score: 50
nq_open_1shot_score: 25
IFEval_Prompt-level-strict-accuracy: 50
drop_accuracy: 81.25
GPQA_diamond_accuracy: 25
hellaswag_accuracy: 87.5
TheoremQA_score: 18.75
musr_average_naive_average: 39.58
korbench_single_naive_average: 40
gsm8k_accuracy: 62.50
math_accuracy: 75
cmo_fib_accuracy: 6.25
aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 50
sanitized_mbpp_score: 68.75
ds1000_naive_average: 16.96
lcb_code_generation_pass@1: 12.5
lcb_code_execution_pass@1: 43.75
lcb_test_output_pass@1: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_naive_average: 72.6
cmmlu-china-specific_naive_average: 76.25
mmlu_pro_math_accuracy: 25
ds1000_Pandas_accuracy: 12.5
ds1000_Numpy_accuracy: 0
ds1000_Tensorflow_accuracy: 12.5
ds1000_Scipy_accuracy: 18.75
ds1000_Sklearn_accuracy: 18.75
ds1000_Pytorch_accuracy: 12.5
ds1000_Matplotlib_accuracy: 43.75
openai_mmmlu_lite_AR-XY_accuracy: 37.5
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 54.48
mtbench101_avg: 8.1
wildbench_average: -9.86
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 8
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
alignment_bench_v1_1_中文理解: 0
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 62
compassarena_knowledge_naive_average: 56
compassarena_reason_v2_naive_average: 49
compassarena_math_v2_naive_average: 57.05
compassarena_creationv2_zh_naive_average: 48.34
fofo_test_prompts_overall: 1
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0
internlm2_5-7b-chat-turbomind_fullbench:
race-high_accuracy: 93.75
ARC-c_accuracy: 87.5
BoolQ_accuracy: 68.75
triviaqa_wiki_1shot_score: 50
nq_open_1shot_score: 25
IFEval_Prompt-level-strict-accuracy: 50
drop_accuracy: 75
GPQA_diamond_accuracy: 25
hellaswag_accuracy: 81.25
TheoremQA_score: 6.25
musr_average_naive_average: 37.5
korbench_single_naive_average: 41.25
gsm8k_accuracy: 68.75
math_accuracy: 75
cmo_fib_accuracy: 6.25
aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 68.75
ds1000_naive_average: 13.39
lcb_code_generation_pass@1: 12.5
lcb_code_execution_pass@1: 43.75
lcb_test_output_pass@1: 12.5
bbh-logical_deduction_seven_objects_score: 56.25
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_naive_average: 74.04
cmmlu-china-specific_naive_average: 76.25
mmlu_pro_math_accuracy: 25
ds1000_Pandas_accuracy: 0
ds1000_Numpy_accuracy: 0
ds1000_Tensorflow_accuracy: 12.5
ds1000_Scipy_accuracy: 18.75
ds1000_Sklearn_accuracy: 18.75
ds1000_Pytorch_accuracy: 6.25
ds1000_Matplotlib_accuracy: 37.5
openai_mmmlu_lite_AR-XY_accuracy: 37.5
college_naive_average: 0
college_knowledge_naive_average: 87.5
alignment_bench_v1_1_总分: 0.68
alpaca_eval_total: 10
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 52.95
mtbench101_avg: 8.1
wildbench_average: -4.44
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 8.2
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
alignment_bench_v1_1_中文理解: 0
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 10
compassarena_language_naive_average: 61.5
compassarena_knowledge_naive_average: 56.5
compassarena_reason_v2_naive_average: 47.5
compassarena_math_v2_naive_average: 53.03
compassarena_creationv2_zh_naive_average: 46.22
fofo_test_prompts_overall: 1
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0
internlm2_5-7b-hf_fullbench:
race-high_accuracy: 100
ARC-c_accuracy: 68.75
BoolQ_accuracy: 87.5
triviaqa_wiki_1shot_score: 43.75
nq_open_1shot_score: 43.75
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25
winogrande_accuracy: 75
gsm8k_accuracy: 37.5
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 12.5
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 56.25
dingo_en_192_score: 37.5
dingo_zh_170_score: 100
mmlu-other_accuracy: 76.92
cmmlu-china-specific_accuracy: 84.17
mmlu_pro_math_accuracy: 18.75
bbh-logical_deduction_seven_objects_score: 43.75
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
internlm2_5-7b-turbomind_fullbench:
race-high_accuracy: 100
ARC-c_accuracy: 68.75
BoolQ_accuracy: 87.5
triviaqa_wiki_1shot_score: 43.75
nq_open_1shot_score: 43.75
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 31.25
winogrande_accuracy: 87.5
gsm8k_accuracy: 68.75
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 18.75
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 56.25
dingo_en_192_score: 43.75
dingo_zh_170_score: 100
mmlu-other_accuracy: 76.92
cmmlu-china-specific_accuracy: 84.17
mmlu_pro_math_accuracy: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
internlm2_5-7b-turbomind:
race-high_accuracy: 89.28
ARC-c_accuracy: 52.2
BoolQ_accuracy: 89.72
triviaqa_wiki_1shot_score: 65.88
nq_open_1shot_score: 34.82
drop_accuracy: 68.1
bbh_naive_average: 72.15
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 88.36
TheoremQA_score: 25
winogrande_accuracy: 81.29
gsm8k_accuracy: 74.68
GaokaoBench_weighted_average: 58.19
math_accuracy: 33.98
Mathbench_naive_average: 48.38
wikibench-wiki-single_choice_cncircular_perf_4: 29.1
cmmlu_naive_average: 78.94
mmlu_naive_average: 71.44
mmlu_pro_naive_average: 38.18
openai_humaneval_humaneval_pass@1: 59.76
openai_humaneval_v2_humaneval_pass@1: 51.22
sanitized_mbpp_score: 55.25
dingo_en_192_score: 60.94
dingo_zh_170_score: 67.65
mmlu-stem_naive_average: 63.72
mmlu-social-science_naive_average: 80.15
mmlu-humanities_naive_average: 74.27
mmlu-other_naive_average: 71.85
cmmlu-stem_naive_average: 67.07
cmmlu-social-science_naive_average: 81.49
cmmlu-humanities_naive_average: 85.84
cmmlu-other_naive_average: 82.69
cmmlu-china-specific_naive_average: 79.88
mmlu_pro_biology_accuracy: 58.58
mmlu_pro_business_accuracy: 28.01
mmlu_pro_chemistry_accuracy: 22.79
mmlu_pro_computer_science_accuracy: 39.02
mmlu_pro_economics_accuracy: 53.08
mmlu_pro_engineering_accuracy: 25.7
mmlu_pro_health_accuracy: 46.94
mmlu_pro_history_accuracy: 43.04
mmlu_pro_law_accuracy: 29.7
mmlu_pro_math_accuracy: 24.2
mmlu_pro_philosophy_accuracy: 42.48
mmlu_pro_physics_accuracy: 26.02
mmlu_pro_psychology_accuracy: 52.76
mmlu_pro_other_accuracy: 42.21
college_naive_average: 10.67
high_naive_average: 6.67
middle_naive_average: 26.67
primary_naive_average: 60
arithmetic_naive_average: 55
mathbench-a (average)_naive_average: 31.8
college_knowledge_naive_average: 62.34
high_knowledge_naive_average: 59.83
middle_knowledge_naive_average: 71.15
primary_knowledge_naive_average: 66.55
mathbench-t (average)_naive_average: 64.97
Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
Single-Needle-Retrieval-EN-32000_naive_average: 100
Single-Needle-Retrieval-ZH-32000_naive_average: 100
Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
Single-Needle-Retrieval-EN-100000_naive_average: 100
Single-Needle-Retrieval-ZH-100000_naive_average: 100
Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
Single-Needle-Retrieval-EN-200000_naive_average: 100
Single-Needle-Retrieval-ZH-200000_naive_average: 100
longbench_naive_average: 46.19
longbench_zh_naive_average: 49.3
longbench_en_naive_average: 43.97
longbench_single-document-qa_naive_average: 42.84
longbench_multi-document-qa_naive_average: 37.29
longbench_summarization_naive_average: 23.21
longbench_few-shot-learning_naive_average: 61.67
longbench_synthetic-tasks_naive_average: 60.05
longbench_code-completion_naive_average: 52.09
internlm2_5-7b-chat-turbomind:
race-high_accuracy: 86.16
ARC-c_accuracy: 90.17
BoolQ_accuracy: 87.89
triviaqa_wiki_1shot_score: 64.91
nq_open_1shot_score: 22.69
mmmlu_lite_naive_average: 44.96
IFEval_Prompt-level-strict-accuracy: 58.04
drop_accuracy: 77.68
bbh_naive_average: 73.14
GPQA_diamond_accuracy: 25.76
hellaswag_accuracy: 94.79
TheoremQA_score: 21.5
musr_average_naive_average: 51.03
korbench_single_naive_average: 31.92
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.73
GaokaoBench_weighted_average: 77.89
math_accuracy: 61.5
cmo_fib_accuracy: 12.5
aime2024_accuracy: 3.33
Mathbench_naive_average: 65.17
wikibench-wiki-single_choice_cncircular_perf_4: 31.55
cmmlu_naive_average: 74.14
mmlu_naive_average: 70.52
mmlu_pro_naive_average: 44.98
openai_humaneval_humaneval_pass@1: 70.73
sanitized_mbpp_score: 63.81
humanevalx_naive_average: 38.17
ds1000_naive_average: 14.15
lcb_code_generation_pass@1: 17.75
lcb_code_execution_pass@1: 32.57
lcb_test_output_pass@1: 24.89
bigcodebench_hard_instruct_pass@1: 0.08
bigcodebench_hard_complete_pass@1: 0.06
teval_naive_average: 80.03
qa_dingo_cn_score: 99.01
mmlu-stem_naive_average: 68.2
mmlu-social-science_naive_average: 76.11
mmlu-humanities_naive_average: 68.71
mmlu-other_naive_average: 70.56
cmmlu-stem_naive_average: 66.27
cmmlu-social-science_naive_average: 75.7
cmmlu-humanities_naive_average: 77.7
cmmlu-other_naive_average: 77.71
cmmlu-china-specific_naive_average: 72.94
mmlu_pro_biology_accuracy: 66.25
mmlu_pro_business_accuracy: 48.42
mmlu_pro_chemistry_accuracy: 35.25
mmlu_pro_computer_science_accuracy: 47.56
mmlu_pro_economics_accuracy: 55.92
mmlu_pro_engineering_accuracy: 30.44
mmlu_pro_health_accuracy: 45.97
mmlu_pro_history_accuracy: 41.21
mmlu_pro_law_accuracy: 25.79
mmlu_pro_math_accuracy: 54.03
mmlu_pro_philosophy_accuracy: 36.47
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.77
mmlu_pro_other_accuracy: 46.21
humanevalx-python_pass@1: 53.66
humanevalx-cpp_pass@1: 24.39
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 57.93
humanevalx-js_pass@1: 54.88
ds1000_Pandas_accuracy: 12.03
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 8.49
ds1000_Sklearn_accuracy: 6.96
ds1000_Pytorch_accuracy: 7.35
ds1000_Matplotlib_accuracy: 49.03
openai_mmmlu_lite_AR-XY_accuracy: 17.89
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.16
openai_mmmlu_lite_ES-LA_accuracy: 56.84
openai_mmmlu_lite_FR-FR_accuracy: 57.96
openai_mmmlu_lite_HI-IN_accuracy: 33.68
openai_mmmlu_lite_ID-ID_accuracy: 51.02
openai_mmmlu_lite_IT-IT_accuracy: 50.46
openai_mmmlu_lite_JA-JP_accuracy: 50.53
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.68
openai_mmmlu_lite_SW-KE_accuracy: 32.77
openai_mmmlu_lite_YO-NG_accuracy: 31.79
openai_mmmlu_lite_ZH-CN_accuracy: 65.05
college_naive_average: 20.33
high_naive_average: 47.67
middle_naive_average: 62
primary_naive_average: 72
arithmetic_naive_average: 62.33
mathbench-a (average)_naive_average: 52.87
college_knowledge_naive_average: 70.57
high_knowledge_naive_average: 70.13
middle_knowledge_naive_average: 81.17
primary_knowledge_naive_average: 88.01
mathbench-t (average)_naive_average: 77.47
alignment_bench_v1_1_总分: 5.68
alpaca_eval_total: 25.96
arenahard_score: 17.15
Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -15.69
simpleqa_accuracy_given_attempted: 0.04
chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_专业能力: 6.05
alignment_bench_v1_1_数学计算: 5.87
alignment_bench_v1_1_基本任务: 6.01
alignment_bench_v1_1_逻辑推理: 4.48
alignment_bench_v1_1_中文理解: 6.17
alignment_bench_v1_1_文本写作: 6.06
alignment_bench_v1_1_角色扮演: 6.3
alignment_bench_v1_1_综合问答: 6.45
alpaca_eval_helpful_base: 17.83
alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 25
compassarena_language_naive_average: 52.5
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 29.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73
followbench_llmeval_en_SSR_AVG: 0.88
followbench_llmeval_en_HSR_L1: 0.94
followbench_llmeval_en_HSR_L2: 0.77
followbench_llmeval_en_HSR_L3: 0.73
followbench_llmeval_en_HSR_L4: 0.68
followbench_llmeval_en_HSR_L5: 0.54
followbench_llmeval_en_SSR_L1: 0.94
followbench_llmeval_en_SSR_L2: 0.88
followbench_llmeval_en_SSR_L3: 0.87
followbench_llmeval_en_SSR_L4: 0.87
followbench_llmeval_en_SSR_L5: 0.85
simpleqa_f1: 0.04
internlm2_5-7b-chat-1m-turbomind:
ruler_8k_naive_average: 88.53
ruler_32k_naive_average: 83.84
ruler_128k_naive_average: 70.94
NeedleBench-Overall-Score-8K_weighted_average: 91.89
NeedleBench-Overall-Score-32K_weighted_average: 91.42
NeedleBench-Overall-Score-128K_weighted_average: 88.57
longbench_naive_average: 46.44
longbench_zh_naive_average: 45.19
longbench_en_naive_average: 45.71
babilong_0k_naive_average: 79.3
babilong_4k_naive_average: 67
babilong_16k_naive_average: 52.7
babilong_32k_naive_average: 48.9
babilong_128k_naive_average: 40.8
babilong_256k_naive_average: 23.5
longbench_single-document-qa_naive_average: 43.56
longbench_multi-document-qa_naive_average: 46.24
longbench_summarization_naive_average: 24.32
longbench_few-shot-learning_naive_average: 51.67
longbench_synthetic-tasks_naive_average: 66.83
longbench_code-completion_naive_average: 45.99
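The per-model blocks above are the expected-score baselines that the fullbench assertions read back after each run; the real comparison logic lives in the assertion script invoked later in the workflow (.github/scripts/oc_score_assert.py). Purely as an illustration, a stripped-down check of one model's baseline against a freshly generated summary could be sketched in Python as follows (the file names, CSV column names, and the 5% tolerance are assumptions made for this example, not the script's actual values):

# Illustrative sketch only: compare one model's baseline metrics against a fresh
# summary. The paths, CSV columns and tolerance are assumptions, not the values
# used by the real .github/scripts/oc_score_assert.py.
import csv
import yaml


def load_baseline(path, model):
    # Each top-level key is a model name mapping metric keys to expected scores.
    with open(path) as f:
        return yaml.safe_load(f)[model]


def assert_scores(baseline, summary_csv, rel_tol=0.05):
    # Build {"<dataset>_<metric>": score} from the summary, then compare entries.
    with open(summary_csv) as f:
        rows = {
            f"{row['dataset']}_{row['metric']}": float(row['score'])
            for row in csv.DictReader(f)
            if row.get('score') not in (None, '', '-')
        }
    for key, expected in baseline.items():
        actual = rows.get(key)
        assert actual is not None, f'missing result for {key}'
        assert abs(actual - expected) <= rel_tol * max(abs(expected), 1.0), \
            f'{key}: expected ~{expected}, got {actual}'


if __name__ == '__main__':
    baseline = load_baseline('oc_score_baseline_fullbench.yaml',
                             'internlm2_5-7b-turbomind_fullbench')
    assert_scores(baseline, 'regression_result_daily/summary.csv')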

@ -1,459 +1,459 @@
baichuan2-7b-chat-hf: baichuan2-7b-chat-hf:
gsm8k: 18.75 gsm8k_accuracy: 18.75
race-high: 78.12 race-high_accuracy: 78.12
glm-4-9b-chat-hf: glm-4-9b-chat-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 90.62 race-high_accuracy: 90.62
glm-4-9b-chat-turbomind: glm-4-9b-chat-turbomind:
gsm8k: 75.00 gsm8k_accuracy: 75.00
race-high: 90.62 race-high_accuracy: 90.62
glm-4-9b-chat-vllm: glm-4-9b-chat-vllm:
gsm8k: 65.62 gsm8k_accuracy: 65.62
race-high: 90.62 race-high_accuracy: 90.62
deepseek-7b-chat-hf: deepseek-7b-chat-hf:
gsm8k: 46.88 gsm8k_accuracy: 46.88
race-high: 81.25 race-high_accuracy: 81.25
deepseek-moe-16b-chat-hf: deepseek-moe-16b-chat-hf:
gsm8k: 50 gsm8k_accuracy: 50
race-high: 68.75 race-high_accuracy: 68.75
deepseek-7b-chat-vllm: deepseek-7b-chat-vllm:
gsm8k: 43.75 gsm8k_accuracy: 43.75
race-high: 75 race-high_accuracy: 75
gemma2-2b-it-hf: gemma2-2b-it-hf:
gsm8k: 50 gsm8k_accuracy: 50
race-high: 71.88 race-high_accuracy: 71.88
gemma2-9b-it-hf: gemma2-9b-it-hf:
gsm8k: 71.88 gsm8k_accuracy: 71.88
race-high: 84.38 race-high_accuracy: 84.38
gemma-2b-it-hf: gemma-2b-it-hf:
gsm8k: 3.12 gsm8k_accuracy: 3.12
race-high: 40.62 race-high_accuracy: 40.62
gemma-7b-it-hf: gemma-7b-it-hf:
gsm8k: 40.62 gsm8k_accuracy: 40.62
race-high: 68.75 race-high_accuracy: 68.75
gemma-2-9b-it-turbomind: gemma-2-9b-it-turbomind:
gsm8k: 65.62 gsm8k_accuracy: 65.62
race-high: 84.38 race-high_accuracy: 84.38
gemma-7b-it-vllm: gemma-7b-it-vllm:
gsm8k: 34.38 gsm8k_accuracy: 34.38
race-high: 68.75 race-high_accuracy: 68.75
internlm2_5-7b-chat-hf: internlm2_5-7b-chat-hf:
gsm8k: 84.38 gsm8k_accuracy: 84.38
race-high: 90.62 race-high_accuracy: 90.62
internlm2_5-7b-chat-turbomind: internlm2_5-7b-chat-turbomind:
gsm8k: 84.38 gsm8k_accuracy: 84.38
race-high: 90.62 race-high_accuracy: 90.62
internlm2-chat-1.8b-turbomind: internlm2-chat-1.8b-turbomind:
gsm8k: 25 gsm8k_accuracy: 25
race-high: 84.38 race-high_accuracy: 84.38
internlm2-chat-1.8b-sft-turbomind: internlm2-chat-1.8b-sft-turbomind:
gsm8k: 21.88 gsm8k_accuracy: 21.88
race-high: 84.38 race-high_accuracy: 84.38
internlm2-chat-7b-lmdeploy: internlm2-chat-7b-lmdeploy:
gsm8k: 53.12 gsm8k_accuracy: 53.12
race-high: 84.38 race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind: internlm2-chat-7b-sft-turbomind:
gsm8k: 50 gsm8k_accuracy: 50
race-high: 90.62 race-high_accuracy: 90.62
internlm2-chat-7b-vllm: internlm2-chat-7b-vllm:
gsm8k: 43.75 gsm8k_accuracy: 43.75
race-high: 87.5 race-high_accuracy: 87.5
llama-3_1-8b-instruct-hf: llama-3_1-8b-instruct-hf:
gsm8k: 84.38 gsm8k_accuracy: 84.38
race-high: 90.62 race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf: llama-3_2-3b-instruct-hf:
gsm8k: 65.62 gsm8k_accuracy: 68.75
race-high: 81.25 race-high_accuracy: 81.25
llama-3-8b-instruct-hf: llama-3-8b-instruct-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 87.5 race-high_accuracy: 87.5
llama-3_1-8b-instruct-turbomind: llama-3_1-8b-instruct-turbomind:
gsm8k: 78.12 gsm8k_accuracy: 78.12
race-high: 90.62 race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind: llama-3_2-3b-instruct-turbomind:
gsm8k: 62.50 gsm8k_accuracy: 65.62
race-high: 81.25 race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind: llama-3-8b-instruct-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 87.5 race-high_accuracy: 87.5
mistral-7b-instruct-v0.2-hf: mistral-7b-instruct-v0.2-hf:
gsm8k: 40.62 gsm8k_accuracy: 40.62
race-high: 75 race-high_accuracy: 75
mistral-7b-instruct-v0.3-hf: mistral-7b-instruct-v0.3-hf:
gsm8k: 40.62 gsm8k_accuracy: 40.62
race-high: 75 race-high_accuracy: 75
mistral-nemo-instruct-2407-hf: mistral-nemo-instruct-2407-hf:
gsm8k: 75 gsm8k_accuracy: 75
race-high: 81.25 race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind: mistral-nemo-instruct-2407-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 87.50 race-high_accuracy: 87.50
mistral-7b-instruct-v0.1-vllm: mistral-7b-instruct-v0.1-vllm:
gsm8k: 34.38 gsm8k_accuracy: 34.38
race-high: 68.75 race-high_accuracy: 68.75
mistral-7b-instruct-v0.2-vllm: mistral-7b-instruct-v0.2-vllm:
gsm8k: 43.75 gsm8k_accuracy: 43.75
race-high: 75 race-high_accuracy: 75
MiniCPM3-4B-hf: MiniCPM3-4B-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 84.38 race-high_accuracy: 84.38
minicpm-2b-dpo-fp32-hf: minicpm-2b-dpo-fp32-hf:
gsm8k: 56.25 gsm8k_accuracy: 56.25
race-high: 53.12 race-high_accuracy: 53.12
minicpm-2b-sft-bf16-hf: minicpm-2b-sft-bf16-hf:
gsm8k: 46.88 gsm8k_accuracy: 46.88
race-high: 65.62 race-high_accuracy: 65.62
minicpm-2b-sft-fp32-hf: minicpm-2b-sft-fp32-hf:
gsm8k: 46.88 gsm8k_accuracy: 46.88
race-high: 65.62 race-high_accuracy: 65.62
phi-3-mini-4k-instruct-hf: phi-3-mini-4k-instruct-hf:
gsm8k: 56.25 gsm8k_accuracy: 56.25
race-high: 84.38 race-high_accuracy: 84.38
qwen1.5-0.5b-chat-hf: qwen1.5-0.5b-chat-hf:
gsm8k: 0 gsm8k_accuracy: 0
race-high: 53.12 race-high_accuracy: 53.12
qwen2-1.5b-instruct-hf: qwen2-1.5b-instruct-hf:
gsm8k: 62.5 gsm8k_accuracy: 62.5
race-high: 84.38 race-high_accuracy: 84.38
qwen2-7b-instruct-hf: qwen2-7b-instruct-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
race-high: 90.62 race-high_accuracy: 90.62
qwen2-1.5b-instruct-turbomind: qwen2-1.5b-instruct-turbomind:
gsm8k: 62.50 gsm8k_accuracy: 62.50
race-high: 84.38 race-high_accuracy: 84.38
qwen2-7b-instruct-turbomind: qwen2-7b-instruct-turbomind:
gsm8k: 81.25 gsm8k_accuracy: 81.25
race-high: 87.5 race-high_accuracy: 87.5
qwen1.5-0.5b-chat-vllm: qwen1.5-0.5b-chat-vllm:
gsm8k: 3.12 gsm8k_accuracy: 3.12
race-high: 53.12 race-high_accuracy: 53.12
yi-1.5-6b-chat-hf: yi-1.5-6b-chat-hf:
gsm8k: 65.62 gsm8k_accuracy: 65.62
race-high: 84.38 race-high_accuracy: 84.38
yi-1.5-9b-chat-hf: yi-1.5-9b-chat-hf:
gsm8k: 75 gsm8k_accuracy: 75
race-high: 93.75 race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf: deepseek-v2-lite-chat-hf:
gsm8k: 43.75 gsm8k_accuracy: 43.75
race-high: 71.88 race-high_accuracy: 71.88
internlm2_5-20b-chat-hf: internlm2_5-20b-chat-hf:
gsm8k: 84.38 gsm8k_accuracy: 84.38
race-high: 87.5 race-high_accuracy: 87.5
internlm2_5-20b-chat-turbomind: internlm2_5-20b-chat-turbomind:
gsm8k: 84.38 gsm8k_accuracy: 84.38
race-high: 87.5 race-high_accuracy: 87.5
mistral-small-instruct-2409-hf: mistral-small-instruct-2409-hf:
gsm8k: 81.25 gsm8k_accuracy: 81.25
race-high: 87.50 race-high_accuracy: 87.50
mistral-small-instruct-2409-turbomind: mistral-small-instruct-2409-turbomind:
gsm8k: 78.12 gsm8k_accuracy: 78.12
race-high: 87.50 race-high_accuracy: 87.50
qwen2.5-14b-instruct-hf: qwen2.5-14b-instruct-hf:
gsm8k: 71.88 gsm8k_accuracy: 71.88
race-high: 96.88 race-high_accuracy: 96.88
qwen2.5-14b-instruct-turbomind: qwen2.5-14b-instruct-turbomind:
gsm8k: 71.88 gsm8k_accuracy: 71.88
race-high: 93.75 race-high_accuracy: 93.75
glm-4-9b-hf: glm-4-9b-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 31.25 GPQA_diamond_accuracy: 31.25
race-high: 93.75 race-high_accuracy: 93.75
winogrande: 84.38 winogrande_accuracy: 84.38
deepseek-moe-16b-base-hf: deepseek-moe-16b-base-hf:
gsm8k: 21.88 gsm8k_accuracy: 21.88
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 21.88 race-high_accuracy: 21.88
winogrande: 65.62 winogrande_accuracy: 65.62
deepseek-7b-base-turbomind: deepseek-7b-base-turbomind:
gsm8k: 21.88 gsm8k_accuracy: 21.88
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 46.88 race-high_accuracy: 46.88
winogrande: 84.38 winogrande_accuracy: 84.38
deepseek-moe-16b-base-vllm: deepseek-moe-16b-base-vllm:
gsm8k: 21.88 gsm8k_accuracy: 21.88
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 25 race-high_accuracy: 25
winogrande: 68.75 winogrande_accuracy: 68.75
gemma2-2b-hf: gemma2-2b-hf:
gsm8k: 31.25 gsm8k_accuracy: 31.25
GPQA_diamond: 3.12 GPQA_diamond_accuracy: 3.12
race-high: 56.25 race-high_accuracy: 56.25
winogrande: 71.88 winogrande_accuracy: 71.88
gemma2-9b-hf: gemma2-9b-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 81.25 race-high_accuracy: 81.25
winogrande: 84.38 winogrande_accuracy: 84.38
gemma-2b-hf: gemma-2b-hf:
gsm8k: 18.75 gsm8k_accuracy: 18.75
GPQA_diamond: 3.12 GPQA_diamond_accuracy: 3.12
race-high: 25 race-high_accuracy: 25
winogrande: 53.12 winogrande_accuracy: 53.12
gemma-7b-hf: gemma-7b-hf:
gsm8k: 56.25 gsm8k_accuracy: 56.25
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 65.62 race-high_accuracy: 65.62
winogrande: 78.12 winogrande_accuracy: 78.12
gemma-2b-vllm: gemma-2b-vllm:
gsm8k: 15.62 gsm8k_accuracy: 15.62
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: race-high_accuracy:
winogrande: winogrande_accuracy:
gemma-7b-vllm: gemma-7b-vllm:
gsm8k: 53.12 gsm8k_accuracy: 53.12
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: race-high_accuracy:
winogrande: winogrande_accuracy:
internlm2_5-7b-hf: internlm2_5-7b-hf:
gsm8k: 37.5 gsm8k_accuracy: 37.5
GPQA_diamond: 25 GPQA_diamond_accuracy: 25
race-high: 93.75 race-high_accuracy: 93.75
winogrande: 71.88 winogrande_accuracy: 71.88
internlm2-7b-hf: internlm2-7b-hf:
gsm8k: 53.12 gsm8k_accuracy: 53.12
GPQA_diamond: 18.75 GPQA_diamond_accuracy: 18.75
race-high: 62.5 race-high_accuracy: 62.5
winogrande: 78.12 winogrande_accuracy: 78.12
internlm2-base-7b-hf: internlm2-base-7b-hf:
gsm8k: 3.12 gsm8k_accuracy: 3.12
GPQA_diamond: 21.88 GPQA_diamond_accuracy: 21.88
race-high: 75 race-high_accuracy: 75
winogrande: 65.62 winogrande_accuracy: 65.62
internlm2-1.8b-turbomind: internlm2-1.8b-turbomind:
gsm8k: 12.5 gsm8k_accuracy: 12.5
GPQA_diamond: 12.5 GPQA_diamond_accuracy: 12.5
race-high: 71.88 race-high_accuracy: 71.88
winogrande: 75 winogrande_accuracy: 75
internlm2_5-7b-turbomind: internlm2_5-7b-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 31.25 GPQA_diamond_accuracy: 31.25
race-high: 93.75 race-high_accuracy: 93.75
winogrande: 84.38 winogrande_accuracy: 84.38
internlm2-7b-turbomind: internlm2-7b-turbomind:
gsm8k: 56.25 gsm8k_accuracy: 56.25
GPQA_diamond: 21.88 GPQA_diamond_accuracy: 21.88
race-high: 75 race-high_accuracy: 75
winogrande: 81.25 winogrande_accuracy: 81.25
internlm2-base-7b-turbomind: internlm2-base-7b-turbomind:
gsm8k: 40.62 gsm8k_accuracy: 40.62
GPQA_diamond: 28.12 GPQA_diamond_accuracy: 28.12
race-high: 84.38 race-high_accuracy: 84.38
winogrande: 71.88 winogrande_accuracy: 71.88
llama-2-7b-hf: llama-2-7b-hf:
gsm8k: 21.88 gsm8k_accuracy: 21.88
GPQA_diamond: 21.88 GPQA_diamond_accuracy: 21.88
race-high: 40.62 race-high_accuracy: 40.62
winogrande: 71.88 winogrande_accuracy: 71.88
llama-3_1-8b-hf: llama-3_1-8b-hf:
gsm8k: 78.12 gsm8k_accuracy: 78.12
GPQA_diamond: 25 GPQA_diamond_accuracy: 25
race-high: 90.62 race-high_accuracy: 90.62
winogrande: 62.5 winogrande_accuracy: 62.5
llama-3-8b-hf: llama-3-8b-hf:
gsm8k: 46.88 gsm8k_accuracy: 46.88
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 65.62 race-high_accuracy: 65.62
winogrande: 65.62 winogrande_accuracy: 65.62
llama-3.1-8b-turbomind: llama-3.1-8b-turbomind:
gsm8k: 56.25 gsm8k_accuracy: 56.25
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 78.12 race-high_accuracy: 78.12
winogrande: 78.12 winogrande_accuracy: 78.12
llama-3-8b-turbomind: llama-3-8b-turbomind:
gsm8k: 50 gsm8k_accuracy: 50
GPQA_diamond: 9.38 GPQA_diamond_accuracy: 9.38
race-high: 65.62 race-high_accuracy: 65.62
winogrande: 78.12 winogrande_accuracy: 78.12
mistral-7b-v0.2-hf: mistral-7b-v0.2-hf:
gsm8k: 31.25 gsm8k_accuracy: 31.25
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 62.5 race-high_accuracy: 62.5
winogrande: 59.38 winogrande_accuracy: 59.38
mistral-7b-v0.3-hf: mistral-7b-v0.3-hf:
gsm8k: 31.25 gsm8k_accuracy: 31.25
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 62.5 race-high_accuracy: 62.5
winogrande: 59.38 winogrande_accuracy: 59.38
mistral-7b-v0.2-vllm: mistral-7b-v0.2-vllm:
gsm8k: 34.38 gsm8k_accuracy: 34.38
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 62.5 race-high_accuracy: 62.5
winogrande: 65.62 winogrande_accuracy: 65.62
qwen2.5-7b-hf: qwen2.5-7b-hf:
gsm8k: 81.25 gsm8k_accuracy: 81.25
GPQA_diamond: 18.75 GPQA_diamond_accuracy: 18.75
race-high: 87.5 race-high_accuracy: 87.5
winogrande: 71.88 winogrande_accuracy: 71.88
qwen2.5-1.5b-turbomind: qwen2.5-1.5b-turbomind:
gsm8k: 71.88 gsm8k_accuracy: 71.88
GPQA_diamond: 15.62 GPQA_diamond_accuracy: 15.62
race-high: 78.12 race-high_accuracy: 78.12
winogrande: 71.88 winogrande_accuracy: 71.88
qwen2.5-7b-turbomind: qwen2.5-7b-turbomind:
gsm8k: 71.88 gsm8k_accuracy: 71.88
GPQA_diamond: 25 GPQA_diamond_accuracy: 25
race-high: 87.5 race-high_accuracy: 87.5
winogrande: 71.88 winogrande_accuracy: 71.88
qwen1.5-moe-a2.7b-hf: qwen1.5-moe-a2.7b-hf:
gsm8k: 62.5 gsm8k_accuracy: 62.5
GPQA_diamond: 18.75 GPQA_diamond_accuracy: 18.75
race-high: 84.38 race-high_accuracy: 84.38
winogrande: 75 winogrande_accuracy: 75
qwen2-0.5b-hf: qwen2-0.5b-hf:
gsm8k: 25 gsm8k_accuracy: 25
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 40.62 race-high_accuracy: 40.62
winogrande: 62.5 winogrande_accuracy: 62.5
qwen2-1.5b-hf: qwen2-1.5b-hf:
gsm8k: 59.38 gsm8k_accuracy: 59.38
GPQA_diamond: 9.38 GPQA_diamond_accuracy: 9.38
race-high: 81.25 race-high_accuracy: 81.25
winogrande: 62.5 winogrande_accuracy: 62.5
qwen2-7b-hf: qwen2-7b-hf:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 9.38 GPQA_diamond_accuracy: 9.38
race-high: 87.5 race-high_accuracy: 87.5
winogrande: 68.75 winogrande_accuracy: 68.75
qwen2-1.5b-turbomind: qwen2-1.5b-turbomind:
gsm8k: 62.50 gsm8k_accuracy: 62.50
GPQA_diamond: 6.25 GPQA_diamond_accuracy: 6.25
race-high: 81.25 race-high_accuracy: 81.25
winogrande: 75 winogrande_accuracy: 75
qwen2-7b-turbomind: qwen2-7b-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 12.5 GPQA_diamond_accuracy: 12.5
race-high: 87.5 race-high_accuracy: 87.5
winogrande: 71.88 winogrande_accuracy: 71.88
qwen1.5-0.5b-vllm: qwen1.5-0.5b-vllm:
gsm8k: 9.38 gsm8k_accuracy: 9.38
GPQA_diamond: 0 GPQA_diamond_accuracy: 0
race-high: 56.25 race-high_accuracy: 56.25
winogrande: 62.5 winogrande_accuracy: 62.5
yi-1.5-6b-hf: yi-1.5-6b-hf:
gsm8k: 62.5 gsm8k_accuracy: 62.5
GPQA_diamond: 3.12 GPQA_diamond_accuracy: 3.12
race-high: 87.5 race-high_accuracy: 87.5
winogrande: 62.5 winogrande_accuracy: 62.5
yi-1.5-9b-hf: yi-1.5-9b-hf:
gsm8k: 75 gsm8k_accuracy: 75
GPQA_diamond: 40.62 GPQA_diamond_accuracy: 40.62
race-high: 87.5 race-high_accuracy: 87.5
winogrande: 59.38 winogrande_accuracy: 59.38
deepseek-v2-lite-hf: deepseek-v2-lite-hf:
gsm8k: 28.12 gsm8k_accuracy: 28.12
GPQA_diamond: 21.88 GPQA_diamond_accuracy: 21.88
race-high: 59.38 race-high_accuracy: 59.38
winogrande: 75 winogrande_accuracy: 75
internlm2-20b-hf: internlm2-20b-hf:
gsm8k: 56.25 gsm8k_accuracy: 56.25
GPQA_diamond: 15.62 GPQA_diamond_accuracy: 15.62
race-high: 68.75 race-high_accuracy: 68.75
winogrande: 75 winogrande_accuracy: 75
internlm2-base-20b-hf: internlm2-base-20b-hf:
gsm8k: 12.5 gsm8k_accuracy: 12.5
GPQA_diamond: 9.38 GPQA_diamond_accuracy: 9.38
race-high: 84.38 race-high_accuracy: 84.38
winogrande: 65.62 winogrande_accuracy: 65.62
internlm2-20b-turbomind: internlm2-20b-turbomind:
gsm8k: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond: 15.62 GPQA_diamond_accuracy: 15.62
race-high: 68.75 race-high_accuracy: 68.75
winogrande: 81.25 winogrande_accuracy: 81.25
qwen2.5-14b-hf: qwen2.5-14b-hf:
gsm8k: 75 gsm8k_accuracy: 75
GPQA_diamond: 37.5 GPQA_diamond_accuracy: 37.5
race-high: 93.75 race-high_accuracy: 93.75
winogrande: 84.38 winogrande_accuracy: 84.38

@ -38,28 +38,21 @@ on:
description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']" description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
type: string type: string
default: "['dsw_cu12']" default: "['dsw_cu12']"
fullbench_eval:
required: true
description: 'fullbench volc functions'
type: string
default: "['base_long_context','base_objective','chat_long_context','chat_objective','chat_subjective']"
schedule: schedule:
- cron: '15 16 * * *' - cron: '15 14 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env: env:
CONDA_ENV: opencompass_regression
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
HF_DATASETS_OFFLINE: 1 HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1 HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1
VLLM_USE_MODELSCOPE: false VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1 HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
jobs: jobs:
@ -129,6 +122,9 @@ jobs:
matrix: matrix:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
runs-on: ${{ matrix.cuda_env }} runs-on: ${{ matrix.cuda_env }}
env:
CONDA_ENV: opencompass_regression
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
environment: 'prod' environment: 'prod'
timeout-minutes: 240 #4hours timeout-minutes: 240 #4hours
steps: steps:
@ -209,6 +205,14 @@ jobs:
cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}} regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
runs-on: ${{ matrix.cuda_env }} runs-on: ${{ matrix.cuda_env }}
env:
CONDA_ENV: opencompass_regression
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
environment: 'prod' environment: 'prod'
timeout-minutes: 240 #4hours timeout-minutes: 240 #4hours
steps: steps:
@ -305,9 +309,68 @@ jobs:
run: | run: |
kill -15 "$restful_pid" kill -15 "$restful_pid"
fullbench_run_test:
if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy']
env:
FULLBENCH_CONDA_ENV: regression_test
FULLBENCH_REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
strategy:
fail-fast: false
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_long_context","base_objective","chat_long_context","chat_objective","chat_subjective"]')}}
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 360 #6hours
steps:
- name: Clone repository
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Download Artifacts
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}
- name: Prepare - reinstall opencompass - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
run: |
. /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.FULLBENCH_CONDA_ENV}}
pip install opencompass*.whl --no-deps
- name: Prepare - download lmdeploy artifact - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
run: |
. /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.FULLBENCH_CONDA_ENV}}
pip install lmdeploy-*.whl --no-deps
- name: Conda env
if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
run: |
. /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.FULLBENCH_CONDA_ENV}}
conda info --envs
pip list
- name: Run command testcase
run: |
. /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.FULLBENCH_CONDA_ENV}}
conda info --envs
export from_tf=TRUE
opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
rm regression_result_daily -f && ln -s ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test] needs: [daily_run_test, fullbench_run_test]
environment: 'prod' environment: 'prod'
timeout-minutes: 5 timeout-minutes: 5
runs-on: self-hosted runs-on: self-hosted
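The fullbench_run_test job added above fans out over the function types supplied by the new fullbench_eval workflow_dispatch input, runs the matching eval_<function_type>.py config on the volc_cu12 runner, symlinks that run's summary directory to regression_result_daily, and then filters the assertion suite by pytest marker so each matrix entry only validates its own baselines; notify_to_feishu now also waits on this job. As a rough illustration of that marker filtering, a marker-scoped test could be shaped like the Python sketch below (the marker name, fixture, baseline path, and parametrized model are assumptions for the example, not the actual contents of .github/scripts/oc_score_assert.py):

# Hedged sketch of a marker-scoped assertion, mirroring how
# `python -m pytest -m <function_type>` selects only the matching tests.
# The baseline path, fixture and parametrized model are hypothetical.
import pytest
import yaml


@pytest.fixture(scope='session')
def fullbench_baseline():
    with open('oc_score_baseline_fullbench.yaml') as f:
        return yaml.safe_load(f)


@pytest.mark.chat_objective
@pytest.mark.parametrize('model', ['internlm2_5-7b-chat-turbomind'])
def test_chat_objective_scores(fullbench_baseline, model):
    # A real test would read scores via the regression_result_daily symlink and
    # compare them to the baseline; this only demonstrates marker selection.
    assert model in fullbench_baseline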

@ -29,7 +29,7 @@ env:
jobs: jobs:
pr_run_test: pr_run_test:
runs-on: self-hosted runs-on: dsw_cu12
environment: 'prod' environment: 'prod'
timeout-minutes: 30 timeout-minutes: 30
steps: steps: