This commit is contained in:
zhulinJulia24 2025-04-02 20:10:33 +08:00
parent 780bc1dd1e
commit e263f3df8d
4 changed files with 596 additions and 53 deletions

View File

@ -101,8 +101,6 @@ with read_base():
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
models as \
lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501
@ -128,8 +126,6 @@ with read_base():
models as hf_phi_3_5_MoE_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_medium_4k_instruct import \
models as hf_phi_3_medium_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_small_8k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_4 import \
@ -174,6 +170,7 @@ with read_base():
from ...volc import infer as volc_infer # noqa: F401, E501
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
hf_deepseek_67b_chat_model[0]['run_cfg']['num_gpus'] = 8
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

View File

@ -175,10 +175,11 @@ class TestApibench:
class TestVolcFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
] for p2 in dataset_list(p1, 'objective')])
@pytest.mark.chat_objective
def test_chat_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):

View File

@ -454,3 +454,530 @@ internlm2_5-7b-chat-1m-turbomind:
longbench_few-shot-learning_score: 51.67
longbench_synthetic-tasks_score: 66.83
longbench_code-completion_score: 45.99
qwen2.5-7b-instruct-turbomind:
objective:
race-high_accuracy: 84.99
ARC-c_accuracy: 92.2
BoolQ_accuracy: 86.7
triviaqa_wiki_1shot_score: 53.06
nq_open_1shot_score: 17.51
mmmlu_lite_naive_average: 54.96
IFEval_Prompt-level-strict-accuracy: 71.53
drop_accuracy: 80.07
bbh_naive_average: 68.81
GPQA_diamond_accuracy: 34.34
hellaswag_accuracy: 85.42
TheoremQA_score: 18.38
musr_average_naive_average: 43.44
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 92.57
GaokaoBench_weighted_average: 80.14
math_accuracy: 73.58
cmo_fib_accuracy: 25
aime2024_accuracy: 16.67
Mathbench_naive_average: 77.33
wikibench-wiki-single_choice_cncircular_perf_4: 34.9
cmmlu_naive_average: 75.97
mmlu_naive_average: 76.01
mmlu_pro_naive_average: 56.12
openai_humaneval_humaneval_pass@1: 83.54
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.29
ds1000_naive_average: 18.66
lcb_code_generation_pass@1: 39.5
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.68
bigcodebench_hard_instruct_pass@1: 100
bigcodebench_hard_complete_pass@1: 100
teval_naive_average: 79.72
SciCode_sub_accuracy: 100
qa_dingo_cn_score: 99.01
mmlu_accuracy: 76.01
mmlu-stem_accuracy: 77.59
mmlu-social-science_accuracy: 79.02
mmlu-humanities_accuracy: 72.07
mmlu-other_accuracy: 74.86
cmmlu_accuracy: 75.97
cmmlu-stem_accuracy: 73.09
cmmlu-social-science_accuracy: 75.95
cmmlu-humanities_accuracy: 76.53
cmmlu-other_accuracy: 78.79
cmmlu-china-specific_accuracy: 73.17
mmlu_pro_accuracy: 56.12
mmlu_pro_biology_accuracy: 71.41
mmlu_pro_business_accuracy: 67.68
mmlu_pro_chemistry_accuracy: 54.59
mmlu_pro_computer_science_accuracy: 58.29
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 42.41
mmlu_pro_health_accuracy: 55.87
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 28.97
mmlu_pro_math_accuracy: 73.13
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 58.43
mmlu_pro_psychology_accuracy: 63.16
mmlu_pro_other_accuracy: 53.57
humanevalx-python_pass@1: 50
humanevalx-cpp_pass@1: 42.07
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 75
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.18
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 10.43
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 60.65
mmmlu_lite_accuracy: 54.96
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.25
openai_mmmlu_lite_DE-DE_accuracy: 59.93
openai_mmmlu_lite_ES-LA_accuracy: 66.53
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 49.26
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.47
openai_mmmlu_lite_JA-JP_accuracy: 61.54
openai_mmmlu_lite_KO-KR_accuracy: 60.28
openai_mmmlu_lite_PT-BR_accuracy: 55.51
openai_mmmlu_lite_SW-KE_accuracy: 36.42
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
college_naive_average: 48
high_naive_average: 59
middle_naive_average: 78
primary_naive_average: 85.67
arithmetic_naive_average: 75.67
mathbench-a (average)_naive_average: 69.27
college_knowledge_naive_average: 83.86
high_knowledge_naive_average: 80.29
middle_knowledge_naive_average: 84.26
primary_knowledge_naive_average: 93.16
mathbench-t (average)_naive_average: 85.39
internlm2_5-7b-chat-pytorch:
objective:
race-high_accuracy: 86.39
ARC-c_accuracy: 90.51
BoolQ_accuracy: 88.01
triviaqa_wiki_1shot_score: 64.77
nq_open_1shot_score: 22.71
mmmlu_lite_naive_average: 45.02
IFEval_Prompt-level-strict-accuracy: 56.56
drop_accuracy: 75.46
bbh_naive_average: 73.34
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 94.81
TheoremQA_score: 23.88
musr_average_naive_average: 51.31
korbench_single_naive_average: 32
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.96
GaokaoBench_weighted_average: 78.05
math_accuracy: 60.34
cmo_fib_accuracy: 12.98
aime2024_accuracy: 3.33
Mathbench_naive_average: 64.82
wikibench-wiki-single_choice_cncircular_perf_4: 31.7
cmmlu_naive_average: 74.24
mmlu_naive_average: 70.2
mmlu_pro_naive_average: 45.39
openai_humaneval_humaneval_pass@1: 70.12
sanitized_mbpp_score: 64.59
humanevalx_naive_average: 38.78
ds1000_naive_average: 14.19
lcb_code_generation_pass@1: 16.5
lcb_code_execution_pass@1: 33.82
lcb_test_output_pass@1: 22.62
bigcodebench_hard_instruct_pass@1: 6.08
bigcodebench_hard_complete_pass@1: 100
teval_naive_average: 100
SciCode_sub_accuracy: 100
qa_dingo_cn_score: 100
mmlu_accuracy: 70.2
mmlu-stem_accuracy: 67.73
mmlu-social-science_accuracy: 75.49
mmlu-humanities_accuracy: 68.56
mmlu-other_accuracy: 70.58
cmmlu_accuracy: 74.24
cmmlu-stem_accuracy: 66.7
cmmlu-social-science_accuracy: 75.88
cmmlu-humanities_accuracy: 77.56
cmmlu-other_accuracy: 77.52
cmmlu-china-specific_accuracy: 73.46
mmlu_pro_accuracy: 45.39
mmlu_pro_biology_accuracy: 65.83
mmlu_pro_business_accuracy: 51.96
mmlu_pro_chemistry_accuracy: 36.84
mmlu_pro_computer_science_accuracy: 48.29
mmlu_pro_economics_accuracy: 56.16
mmlu_pro_engineering_accuracy: 29.1
mmlu_pro_health_accuracy: 44.5
mmlu_pro_history_accuracy: 42.26
mmlu_pro_law_accuracy: 24.98
mmlu_pro_math_accuracy: 54.85
mmlu_pro_philosophy_accuracy: 39.28
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.27
mmlu_pro_other_accuracy: 45.78
humanevalx-python_pass@1: 56.1
humanevalx-cpp_pass@1: 20.73
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 59.15
humanevalx-js_pass@1: 57.93
ds1000_Pandas_accuracy: 8.93
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 7.55
ds1000_Sklearn_accuracy: 7.83
ds1000_Pytorch_accuracy: 8.82
ds1000_Matplotlib_accuracy: 50.97
mmmlu_lite_accuracy: 45.02
openai_mmmlu_lite_AR-XY_accuracy: 18.6
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.23
openai_mmmlu_lite_ES-LA_accuracy: 56.63
openai_mmmlu_lite_FR-FR_accuracy: 58.11
openai_mmmlu_lite_HI-IN_accuracy: 33.82
openai_mmmlu_lite_ID-ID_accuracy: 50.39
openai_mmmlu_lite_IT-IT_accuracy: 50.39
openai_mmmlu_lite_JA-JP_accuracy: 50.95
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.89
openai_mmmlu_lite_SW-KE_accuracy: 32.14
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 65.33
college_naive_average: 21
high_naive_average: 47
middle_naive_average: 59.67
primary_naive_average: 76
arithmetic_naive_average: 62
mathbench-a (average)_naive_average: 53.13
college_knowledge_naive_average: 68.99
high_knowledge_naive_average: 70.06
middle_knowledge_naive_average: 78.53
primary_knowledge_naive_average: 88.49
mathbench-t (average)_naive_average: 76.51
qwen2.5-7b-instruct-pytorch:
objective:
race-high_accuracy: 85.16
ARC-c_accuracy: 90.85
BoolQ_accuracy: 86.61
triviaqa_wiki_1shot_score: 52.96
nq_open_1shot_score: 17.62
mmmlu_lite_naive_average: 54.7
IFEval_Prompt-level-strict-accuracy: 71.35
drop_accuracy: 80.23
bbh_naive_average: 68.88
GPQA_diamond_accuracy: 36.36
hellaswag_accuracy: 85.49
TheoremQA_score: 18.38
musr_average_naive_average: 43.3
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 91.66
GaokaoBench_weighted_average: 80.02
math_accuracy: 73.74
cmo_fib_accuracy: 26.44
aime2024_accuracy: 10
Mathbench_naive_average: 77.08
wikibench-wiki-single_choice_cncircular_perf_4: 34
cmmlu_naive_average: 75.9
mmlu_naive_average: 76.27
mmlu_pro_naive_average: 56.14
openai_humaneval_humaneval_pass@1: 84.76
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.17
ds1000_naive_average: 18.57
lcb_code_generation_pass@1: 38.75
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.45
bigcodebench_hard_instruct_pass@1: 100
bigcodebench_hard_complete_pass@1: 100
teval_naive_average: 100
SciCode_sub_accuracy: 100
qa_dingo_cn_score: 100
mmlu_accuracy: 76.27
mmlu-stem_accuracy: 77.75
mmlu-social-science_accuracy: 78.65
mmlu-humanities_accuracy: 73.12
mmlu-other_accuracy: 75.05
cmmlu_accuracy: 75.9
cmmlu-stem_accuracy: 73.41
cmmlu-social-science_accuracy: 75.97
cmmlu-humanities_accuracy: 76.42
cmmlu-other_accuracy: 78.15
cmmlu-china-specific_accuracy: 73.27
mmlu_pro_accuracy: 56.14
mmlu_pro_biology_accuracy: 72.25
mmlu_pro_business_accuracy: 66.16
mmlu_pro_chemistry_accuracy: 55.65
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 41.38
mmlu_pro_health_accuracy: 54.89
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 29.06
mmlu_pro_math_accuracy: 73.58
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 60.05
mmlu_pro_psychology_accuracy: 61.9
mmlu_pro_other_accuracy: 52.6
humanevalx-python_pass@1: 51.83
humanevalx-cpp_pass@1: 42.68
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 73.78
humanevalx-js_pass@1: 72.56
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.64
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 8.7
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 61.29
mmmlu_lite_accuracy: 54.7
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.18
openai_mmmlu_lite_DE-DE_accuracy: 60
openai_mmmlu_lite_ES-LA_accuracy: 66.18
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 48.63
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.26
openai_mmmlu_lite_JA-JP_accuracy: 60.7
openai_mmmlu_lite_KO-KR_accuracy: 60.63
openai_mmmlu_lite_PT-BR_accuracy: 54.46
openai_mmmlu_lite_SW-KE_accuracy: 36
openai_mmmlu_lite_YO-NG_accuracy: 31.86
openai_mmmlu_lite_ZH-CN_accuracy: 69.4
college_naive_average: 48.33
high_naive_average: 59.33
middle_naive_average: 76.67
primary_naive_average: 86.67
arithmetic_naive_average: 74.33
mathbench-a (average)_naive_average: 69.07
college_knowledge_naive_average: 83.54
high_knowledge_naive_average: 80.82
middle_knowledge_naive_average: 83.79
primary_knowledge_naive_average: 92.22
mathbench-t (average)_naive_average: 85.1
internlm3-8b-instruct-turbomind:
objective:
race-high_accuracy: 89.22
ARC-c_accuracy: 92.54
BoolQ_accuracy: 86.45
triviaqa_wiki_1shot_score: 60.72
nq_open_1shot_score: 20.25
mmmlu_lite_naive_average: 41.82
IFEval_Prompt-level-strict-accuracy: 77.45
drop_accuracy: 83.27
bbh_naive_average: 55.22
GPQA_diamond_accuracy: 37.88
hellaswag_accuracy: 91.28
TheoremQA_score: 20.12
musr_average_naive_average: 36.86
korbench_single_naive_average: 41.2
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 91.28
GaokaoBench_weighted_average: 86.59
math_accuracy: 76.96
cmo_fib_accuracy: 35.1
aime2024_accuracy: 16.67
Mathbench_naive_average: 78.96
wikibench-wiki-single_choice_cncircular_perf_4: 37.45
cmmlu_naive_average: 83.33
mmlu_naive_average: 76.21
mmlu_pro_naive_average: 57.96
openai_humaneval_humaneval_pass@1: 81.71
sanitized_mbpp_score: 69.65
humanevalx_naive_average: 40.73
ds1000_naive_average: 27.23
lcb_code_generation_pass@1: 34.75
lcb_code_execution_pass@1: 49.9
lcb_test_output_pass@1: 48.19
bigcodebench_hard_instruct_pass@1: 100
bigcodebench_hard_complete_pass@1: 100
teval_naive_average: 100
SciCode_sub_accuracy: 100
qa_dingo_cn_score: 100
mmlu_accuracy: 76.21
mmlu-stem_accuracy: 77.7
mmlu-social-science_accuracy: 80.98
mmlu-humanities_accuracy: 70.83
mmlu-other_accuracy: 75.01
cmmlu_accuracy: 83.33
cmmlu-stem_accuracy: 79.66
cmmlu-social-science_accuracy: 83.39
cmmlu-humanities_accuracy: 84.73
cmmlu-other_accuracy: 86.2
cmmlu-china-specific_accuracy: 81.77
mmlu_pro_accuracy: 57.96
mmlu_pro_biology_accuracy: 75.45
mmlu_pro_business_accuracy: 64.64
mmlu_pro_chemistry_accuracy: 59.81
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 68.6
mmlu_pro_engineering_accuracy: 44.79
mmlu_pro_health_accuracy: 58.31
mmlu_pro_history_accuracy: 49.87
mmlu_pro_law_accuracy: 32.43
mmlu_pro_math_accuracy: 70.17
mmlu_pro_philosophy_accuracy: 46.89
mmlu_pro_physics_accuracy: 59.58
mmlu_pro_psychology_accuracy: 66.29
mmlu_pro_other_accuracy: 54.33
humanevalx-python_pass@1: 43.9
humanevalx-cpp_pass@1: 20.12
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 65.24
ds1000_Pandas_accuracy: 16.49
ds1000_Numpy_accuracy: 34.09
ds1000_Tensorflow_accuracy: 26.67
ds1000_Scipy_accuracy: 17.92
ds1000_Sklearn_accuracy: 20.87
ds1000_Pytorch_accuracy: 19.12
ds1000_Matplotlib_accuracy: 55.48
mmmlu_lite_accuracy: 41.82
openai_mmmlu_lite_AR-XY_accuracy: 32.56
openai_mmmlu_lite_BN-BD_accuracy: 4.56
openai_mmmlu_lite_DE-DE_accuracy: 24.91
openai_mmmlu_lite_ES-LA_accuracy: 51.09
openai_mmmlu_lite_FR-FR_accuracy: 61.68
openai_mmmlu_lite_HI-IN_accuracy: 24.98
openai_mmmlu_lite_ID-ID_accuracy: 44.56
openai_mmmlu_lite_IT-IT_accuracy: 52.35
openai_mmmlu_lite_JA-JP_accuracy: 51.02
openai_mmmlu_lite_KO-KR_accuracy: 47.93
openai_mmmlu_lite_PT-BR_accuracy: 53.89
openai_mmmlu_lite_SW-KE_accuracy: 33.47
openai_mmmlu_lite_YO-NG_accuracy: 33.47
openai_mmmlu_lite_ZH-CN_accuracy: 69.05
college_naive_average: 45.67
high_naive_average: 64.67
middle_naive_average: 82.33
primary_naive_average: 90.33
arithmetic_naive_average: 74
mathbench-a (average)_naive_average: 71.4
college_knowledge_naive_average: 85.28
high_knowledge_naive_average: 79.43
middle_knowledge_naive_average: 87.9
primary_knowledge_naive_average: 93.42
mathbench-t (average)_naive_average: 86.51
internlm3-8b-instruct-pytorch:
objective:
race-high_accuracy: 89.02
ARC-c_accuracy: 93.56
BoolQ_accuracy: 86.67
triviaqa_wiki_1shot_score: 60.54
nq_open_1shot_score: 20.3
mmmlu_lite_naive_average: 42.6
IFEval_Prompt-level-strict-accuracy: 79.11
drop_accuracy: 83.32
bbh_naive_average: 54.76
GPQA_diamond_accuracy: 42.42
hellaswag_accuracy: 91.31
TheoremQA_score: 18
musr_average_naive_average: 36.62
korbench_single_naive_average: 41.84
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 90.67
GaokaoBench_weighted_average: 86.27
math_accuracy: 76.68
cmo_fib_accuracy: 33.65
aime2024_accuracy: 10
Mathbench_naive_average: 78.92
wikibench-wiki-single_choice_cncircular_perf_4: 37.35
cmmlu_naive_average: 83.11
mmlu_naive_average: 76.23
mmlu_pro_naive_average: 58.16
openai_humaneval_humaneval_pass@1: 82.32
sanitized_mbpp_score: 70.04
humanevalx_naive_average: 39.76
ds1000_naive_average: 27.84
lcb_code_generation_pass@1: 34.5
lcb_code_execution_pass@1: 48.02
lcb_test_output_pass@1: 47.74
bigcodebench_hard_instruct_pass@1: 100
bigcodebench_hard_complete_pass@1: 100
teval_naive_average: 100
SciCode_sub_accuracy: 100
qa_dingo_cn_score: 100
mmlu_accuracy: 76.23
mmlu-stem_accuracy: 78.08
mmlu-social-science_accuracy: 80.31
mmlu-humanities_accuracy: 71.38
mmlu-other_accuracy: 74.63
cmmlu_accuracy: 83.11
cmmlu-stem_accuracy: 79.42
cmmlu-social-science_accuracy: 83.34
cmmlu-humanities_accuracy: 83.95
cmmlu-other_accuracy: 86.22
cmmlu-china-specific_accuracy: 81.5
mmlu_pro_accuracy: 58.16
mmlu_pro_biology_accuracy: 74.62
mmlu_pro_business_accuracy: 65.02
mmlu_pro_chemistry_accuracy: 60.69
mmlu_pro_computer_science_accuracy: 61.46
mmlu_pro_economics_accuracy: 68.25
mmlu_pro_engineering_accuracy: 45.3
mmlu_pro_health_accuracy: 60.15
mmlu_pro_history_accuracy: 50.66
mmlu_pro_law_accuracy: 31.7
mmlu_pro_math_accuracy: 70.32
mmlu_pro_philosophy_accuracy: 47.7
mmlu_pro_physics_accuracy: 59.51
mmlu_pro_psychology_accuracy: 65.41
mmlu_pro_other_accuracy: 53.46
humanevalx-python_pass@1: 42.68
humanevalx-cpp_pass@1: 19.51
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 72.56
humanevalx-js_pass@1: 64.02
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 35
ds1000_Tensorflow_accuracy: 24.44
ds1000_Scipy_accuracy: 20.75
ds1000_Sklearn_accuracy: 21.74
ds1000_Pytorch_accuracy: 22.06
ds1000_Matplotlib_accuracy: 56.77
mmmlu_lite_accuracy: 42.6
openai_mmmlu_lite_AR-XY_accuracy: 32.84
openai_mmmlu_lite_BN-BD_accuracy: 10.46
openai_mmmlu_lite_DE-DE_accuracy: 24.56
openai_mmmlu_lite_ES-LA_accuracy: 50.95
openai_mmmlu_lite_FR-FR_accuracy: 61.05
openai_mmmlu_lite_HI-IN_accuracy: 30.6
openai_mmmlu_lite_ID-ID_accuracy: 45.89
openai_mmmlu_lite_IT-IT_accuracy: 51.79
openai_mmmlu_lite_JA-JP_accuracy: 51.65
openai_mmmlu_lite_KO-KR_accuracy: 48.77
openai_mmmlu_lite_PT-BR_accuracy: 52.7
openai_mmmlu_lite_SW-KE_accuracy: 32.91
openai_mmmlu_lite_YO-NG_accuracy: 32.84
openai_mmmlu_lite_ZH-CN_accuracy: 69.33
college_naive_average: 47
high_naive_average: 66.67
middle_naive_average: 81.67
primary_naive_average: 89.33
arithmetic_naive_average: 73.67
mathbench-a (average)_naive_average: 71.67
college_knowledge_naive_average: 82.91
high_knowledge_naive_average: 79.86
middle_knowledge_naive_average: 88.92
primary_knowledge_naive_average: 92.96
mathbench-t (average)_naive_average: 86.16

View File

@ -1,21 +1,24 @@
chat:
glm-4-9b-chat-hf:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
glm-4-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62
glm-4-9b-chat-vllm:
gsm8k_accuracy: 71.88
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
race-high_accuracy: 81.25
deepseek-moe-16b-chat-hf:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-r1-distill-llama-8b-turbomind:
gsm8k_accuracy: 31.25
race-high_accuracy: 81.25
deepseek-r1-distill-qwen-1_5b-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 53.12
deepseek-7b-chat-vllm:
gsm8k_accuracy: 50
gsm8k_accuracy: 43.75
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
@ -36,34 +39,40 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 46.88
gsm8k_accuracy: 31.25
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-hf:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.5
internlm2_5-7b-chat-turbomind:
gsm8k_accuracy: 87.50
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 31.25
race-high_accuracy: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 59.38
race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
internlm3-8b-instruct-turbomind:
gsm8k_accuracy: 68.75
race-high_accuracy: 87.5
internlm2-chat-7b-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 84.38
gsm8k_accuracy: 59.38
race-high_accuracy: 87.50
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf:
gsm8k_accuracy: 68.75
gsm8k_accuracy: 71.88
race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
gsm8k_accuracy: 68.75
@ -72,14 +81,17 @@ chat:
gsm8k_accuracy: 18.75
race-high_accuracy: 46.88
llama-3_1-8b-instruct-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 65.62
gsm8k_accuracy: 75.00
race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 87.5
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
internvl2_5-8b-turbomind:
gsm8k_accuracy: 0
race-high_accuracy: 0
mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
race-high_accuracy: 75
@ -94,13 +106,10 @@ chat:
race-high_accuracy: 78.12
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
race-high_accuracy: 65.62
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 31.25
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
gsm8k_accuracy: 21.88
race-high_accuracy: 78.12
qwen2.5-0.5b-instruct-hf:
gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
@ -108,10 +117,10 @@ chat:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
qwen2.5-0.5b-instruct-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 50
gsm8k_accuracy: 31.25
race-high_accuracy: 43.75
qwen2.5-3b-instruct-turbomind:
gsm8k_accuracy: 59.38
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
qwen1.5-0.5b-chat-hf:
gsm8k_accuracy: 0
@ -123,11 +132,11 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
qwen2-7b-instruct-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
race-high_accuracy: 87.50
qwen1.5-0.5b-chat-vllm:
gsm8k_accuracy: 3.12
race-high_accuracy: 53.12
@ -143,11 +152,11 @@ chat:
yi-1.5-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf:
gsm8k_accuracy: 46.88
deepseek-v2_lite-chat-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 71.88
gemma2-27b-it-hf:
gsm8k_accuracy: 75
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
internlm2_5-20b-chat-hf:
gsm8k_accuracy: 84.38
@ -161,6 +170,9 @@ chat:
mistral-small-instruct-2409-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
phi-4:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
qwen2.5-14b-instruct-hf:
gsm8k_accuracy: 71.88
race-high_accuracy: 96.88
@ -168,11 +180,11 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 93.75
yi-1.5-34b-chat-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 75.00
race-high_accuracy: 93.75
deepseek-67b-chat-hf:
gsm8k_accuracy: 71.88
race-high_accuracy: 78.12
deepseek-r1-distill-qwen-32b-turbomind:
gsm8k_accuracy: 25
race-high_accuracy: 90.62
llama-3_3-70b-instruct-turbomind:
gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
@ -180,20 +192,26 @@ chat:
gsm8k_accuracy: 59.38
race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
gsm8k_accuracy: 90.62
gsm8k_accuracy: 87.50
race-high_accuracy: 93.75
nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
gsm8k_accuracy: 87.5
race-high_accuracy: 46.88
gsm8k_accuracy: 93.75
race-high_accuracy: 50.00
qwen2.5-72b-instruct-turbomind:
gsm8k_accuracy: 75
race-high_accuracy: 93.75
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
deepseek-r1-distill-llama-70b-turbomind:
gsm8k_accuracy: 40.62
race-high_accuracy: 90.62
deepseek-v2_5-1210-turbomind:
gsm8k_accuracy: 90.62
race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 81.25
mixtral-8x22b-instruct-v0.1-turbomind:
gsm8k_accuracy: 75
race-high_accuracy: 78.12
mixtral-8x22b-instruct-v0.1-vllm:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
base:
glm-4-9b-hf:
gsm8k_accuracy: 68.75