Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00.

Commit e263f3df8d ("update"), parent 780bc1dd1e.
@@ -101,8 +101,6 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
-        models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
         models as \
         lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
@@ -128,8 +126,6 @@ with read_base():
         models as hf_phi_3_5_MoE_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_medium_4k_instruct import \
         models as hf_phi_3_medium_4k_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
-        models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
         models as hf_phi_3_small_8k_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_4 import \
@@ -174,6 +170,7 @@ with read_base():
     from ...volc import infer as volc_infer  # noqa: F401, E501
 
 hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
+hf_deepseek_67b_chat_model[0]['run_cfg']['num_gpus'] = 8
 
 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
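The two removed imports drop the Mixtral-8x22B-Instruct-v0.1 and Phi-3-mini HF model configs from this run, and the one added line raises the DeepSeek-67B chat model to 8 GPUs by patching its config in place. For readers unfamiliar with the OpenCompass config idiom, below is a minimal self-contained sketch (toy names, not code from this commit) of why a bare `models as xxx_model` import is enough to register a model, and how the trailing `sum(...)` gathers every `*_datasets` variable in the module into one flat list:

# Sketch of the OpenCompass config-collection idiom with stand-in data.
# Inside `with read_base():` the imported config modules are executed and
# their `models` / `datasets` lists are bound into this module's namespace;
# plain lists serve as stand-ins here.
demo_a_datasets = [{'abbr': 'demo-a'}]
demo_b_datasets = [{'abbr': 'demo-b'}]
hf_demo_model = [{'abbr': 'demo-chat-hf', 'run_cfg': {'num_gpus': 1}}]

# Patch a model config after import, as the commit does for DeepSeek-67B.
hf_demo_model[0]['run_cfg']['num_gpus'] = 8

# Collect every `*_datasets` list bound in this namespace.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
assert [d['abbr'] for d in datasets] == ['demo-a', 'demo-b']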
.github/scripts/oc_score_assert.py (vendored, 9 lines changed)
@@ -175,10 +175,11 @@ class TestApibench:
 class TestVolcFullbench:
     """Test cases for chat model."""
 
-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
-         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
+        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
+        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
+    ] for p2 in dataset_list(p1, 'objective')])
     @pytest.mark.chat_objective
     def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):
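The old decorator pinned both the model list and the `dataset_list` call to `internlm2_5-7b-chat-turbomind`; the rewrite reuses the loop variable `p1`, so each of the six models is paired with its own objective dataset list. A minimal sketch of how that comprehension expands (the stubbed `dataset_list` is an assumption; the real helper lives elsewhere in this script and derives the list from the baseline YAML):

# Stub for illustration only; the real dataset_list is defined in
# oc_score_assert.py and is not shown in this diff.
def dataset_list(model, category):
    return [f'{category}/gsm8k', f'{category}/race-high']

models = [
    'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
    'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
    'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
]
params = [(p1, p2) for p1 in models for p2 in dataset_list(p1, 'objective')]
print(len(params))  # 6 models x 2 stub datasets = 12 parametrized cases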
.github/scripts/oc_score_baseline_fullbench.yaml (vendored, 527 lines changed)
@@ -454,3 +454,530 @@ internlm2_5-7b-chat-1m-turbomind:
     longbench_few-shot-learning_score: 51.67
     longbench_synthetic-tasks_score: 66.83
     longbench_code-completion_score: 45.99
+
+
+qwen2.5-7b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 84.99
+    ARC-c_accuracy: 92.2
+    BoolQ_accuracy: 86.7
+    triviaqa_wiki_1shot_score: 53.06
+    nq_open_1shot_score: 17.51
+    mmmlu_lite_naive_average: 54.96
+    IFEval_Prompt-level-strict-accuracy: 71.53
+    drop_accuracy: 80.07
+    bbh_naive_average: 68.81
+    GPQA_diamond_accuracy: 34.34
+    hellaswag_accuracy: 85.42
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.44
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 92.57
+    GaokaoBench_weighted_average: 80.14
+    math_accuracy: 73.58
+    cmo_fib_accuracy: 25
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 77.33
+    wikibench-wiki-single_choice_cncircular_perf_4: 34.9
+    cmmlu_naive_average: 75.97
+    mmlu_naive_average: 76.01
+    mmlu_pro_naive_average: 56.12
+    openai_humaneval_humaneval_pass@1: 83.54
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.29
+    ds1000_naive_average: 18.66
+    lcb_code_generation_pass@1: 39.5
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.68
+    bigcodebench_hard_instruct_pass@1: 100
+    bigcodebench_hard_complete_pass@1: 100
+    teval_naive_average: 79.72
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 99.01
+    mmlu_accuracy: 76.01
+    mmlu-stem_accuracy: 77.59
+    mmlu-social-science_accuracy: 79.02
+    mmlu-humanities_accuracy: 72.07
+    mmlu-other_accuracy: 74.86
+    cmmlu_accuracy: 75.97
+    cmmlu-stem_accuracy: 73.09
+    cmmlu-social-science_accuracy: 75.95
+    cmmlu-humanities_accuracy: 76.53
+    cmmlu-other_accuracy: 78.79
+    cmmlu-china-specific_accuracy: 73.17
+    mmlu_pro_accuracy: 56.12
+    mmlu_pro_biology_accuracy: 71.41
+    mmlu_pro_business_accuracy: 67.68
+    mmlu_pro_chemistry_accuracy: 54.59
+    mmlu_pro_computer_science_accuracy: 58.29
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 42.41
+    mmlu_pro_health_accuracy: 55.87
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 28.97
+    mmlu_pro_math_accuracy: 73.13
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 58.43
+    mmlu_pro_psychology_accuracy: 63.16
+    mmlu_pro_other_accuracy: 53.57
+    humanevalx-python_pass@1: 50
+    humanevalx-cpp_pass@1: 42.07
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 75
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.18
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 10.43
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 60.65
+    mmmlu_lite_accuracy: 54.96
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.25
+    openai_mmmlu_lite_DE-DE_accuracy: 59.93
+    openai_mmmlu_lite_ES-LA_accuracy: 66.53
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 49.26
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.47
+    openai_mmmlu_lite_JA-JP_accuracy: 61.54
+    openai_mmmlu_lite_KO-KR_accuracy: 60.28
+    openai_mmmlu_lite_PT-BR_accuracy: 55.51
+    openai_mmmlu_lite_SW-KE_accuracy: 36.42
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.61
+    college_naive_average: 48
+    high_naive_average: 59
+    middle_naive_average: 78
+    primary_naive_average: 85.67
+    arithmetic_naive_average: 75.67
+    mathbench-a (average)_naive_average: 69.27
+    college_knowledge_naive_average: 83.86
+    high_knowledge_naive_average: 80.29
+    middle_knowledge_naive_average: 84.26
+    primary_knowledge_naive_average: 93.16
+    mathbench-t (average)_naive_average: 85.39
+
+
+
+
+internlm2_5-7b-chat-pytorch:
+  objective:
+    race-high_accuracy: 86.39
+    ARC-c_accuracy: 90.51
+    BoolQ_accuracy: 88.01
+    triviaqa_wiki_1shot_score: 64.77
+    nq_open_1shot_score: 22.71
+    mmmlu_lite_naive_average: 45.02
+    IFEval_Prompt-level-strict-accuracy: 56.56
+    drop_accuracy: 75.46
+    bbh_naive_average: 73.34
+    GPQA_diamond_accuracy: 32.83
+    hellaswag_accuracy: 94.81
+    TheoremQA_score: 23.88
+    musr_average_naive_average: 51.31
+    korbench_single_naive_average: 32
+    ARC_Prize_Public_Evaluation_accuracy: 0.01
+    gsm8k_accuracy: 86.96
+    GaokaoBench_weighted_average: 78.05
+    math_accuracy: 60.34
+    cmo_fib_accuracy: 12.98
+    aime2024_accuracy: 3.33
+    Mathbench_naive_average: 64.82
+    wikibench-wiki-single_choice_cncircular_perf_4: 31.7
+    cmmlu_naive_average: 74.24
+    mmlu_naive_average: 70.2
+    mmlu_pro_naive_average: 45.39
+    openai_humaneval_humaneval_pass@1: 70.12
+    sanitized_mbpp_score: 64.59
+    humanevalx_naive_average: 38.78
+    ds1000_naive_average: 14.19
+    lcb_code_generation_pass@1: 16.5
+    lcb_code_execution_pass@1: 33.82
+    lcb_test_output_pass@1: 22.62
+    bigcodebench_hard_instruct_pass@1: 6.08
+    bigcodebench_hard_complete_pass@1: 100
+    teval_naive_average: 100
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 70.2
+    mmlu-stem_accuracy: 67.73
+    mmlu-social-science_accuracy: 75.49
+    mmlu-humanities_accuracy: 68.56
+    mmlu-other_accuracy: 70.58
+    cmmlu_accuracy: 74.24
+    cmmlu-stem_accuracy: 66.7
+    cmmlu-social-science_accuracy: 75.88
+    cmmlu-humanities_accuracy: 77.56
+    cmmlu-other_accuracy: 77.52
+    cmmlu-china-specific_accuracy: 73.46
+    mmlu_pro_accuracy: 45.39
+    mmlu_pro_biology_accuracy: 65.83
+    mmlu_pro_business_accuracy: 51.96
+    mmlu_pro_chemistry_accuracy: 36.84
+    mmlu_pro_computer_science_accuracy: 48.29
+    mmlu_pro_economics_accuracy: 56.16
+    mmlu_pro_engineering_accuracy: 29.1
+    mmlu_pro_health_accuracy: 44.5
+    mmlu_pro_history_accuracy: 42.26
+    mmlu_pro_law_accuracy: 24.98
+    mmlu_pro_math_accuracy: 54.85
+    mmlu_pro_philosophy_accuracy: 39.28
+    mmlu_pro_physics_accuracy: 37.41
+    mmlu_pro_psychology_accuracy: 58.27
+    mmlu_pro_other_accuracy: 45.78
+    humanevalx-python_pass@1: 56.1
+    humanevalx-cpp_pass@1: 20.73
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 59.15
+    humanevalx-js_pass@1: 57.93
+    ds1000_Pandas_accuracy: 8.93
+    ds1000_Numpy_accuracy: 4.09
+    ds1000_Tensorflow_accuracy: 11.11
+    ds1000_Scipy_accuracy: 7.55
+    ds1000_Sklearn_accuracy: 7.83
+    ds1000_Pytorch_accuracy: 8.82
+    ds1000_Matplotlib_accuracy: 50.97
+    mmmlu_lite_accuracy: 45.02
+    openai_mmmlu_lite_AR-XY_accuracy: 18.6
+    openai_mmmlu_lite_BN-BD_accuracy: 27.58
+    openai_mmmlu_lite_DE-DE_accuracy: 51.23
+    openai_mmmlu_lite_ES-LA_accuracy: 56.63
+    openai_mmmlu_lite_FR-FR_accuracy: 58.11
+    openai_mmmlu_lite_HI-IN_accuracy: 33.82
+    openai_mmmlu_lite_ID-ID_accuracy: 50.39
+    openai_mmmlu_lite_IT-IT_accuracy: 50.39
+    openai_mmmlu_lite_JA-JP_accuracy: 50.95
+    openai_mmmlu_lite_KO-KR_accuracy: 45.05
+    openai_mmmlu_lite_PT-BR_accuracy: 57.89
+    openai_mmmlu_lite_SW-KE_accuracy: 32.14
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 65.33
+    college_naive_average: 21
+    high_naive_average: 47
+    middle_naive_average: 59.67
+    primary_naive_average: 76
+    arithmetic_naive_average: 62
+    mathbench-a (average)_naive_average: 53.13
+    college_knowledge_naive_average: 68.99
+    high_knowledge_naive_average: 70.06
+    middle_knowledge_naive_average: 78.53
+    primary_knowledge_naive_average: 88.49
+    mathbench-t (average)_naive_average: 76.51
+
+
+qwen2.5-7b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 85.16
+    ARC-c_accuracy: 90.85
+    BoolQ_accuracy: 86.61
+    triviaqa_wiki_1shot_score: 52.96
+    nq_open_1shot_score: 17.62
+    mmmlu_lite_naive_average: 54.7
+    IFEval_Prompt-level-strict-accuracy: 71.35
+    drop_accuracy: 80.23
+    bbh_naive_average: 68.88
+    GPQA_diamond_accuracy: 36.36
+    hellaswag_accuracy: 85.49
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.3
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 91.66
+    GaokaoBench_weighted_average: 80.02
+    math_accuracy: 73.74
+    cmo_fib_accuracy: 26.44
+    aime2024_accuracy: 10
+    Mathbench_naive_average: 77.08
+    wikibench-wiki-single_choice_cncircular_perf_4: 34
+    cmmlu_naive_average: 75.9
+    mmlu_naive_average: 76.27
+    mmlu_pro_naive_average: 56.14
+    openai_humaneval_humaneval_pass@1: 84.76
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.17
+    ds1000_naive_average: 18.57
+    lcb_code_generation_pass@1: 38.75
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.45
+    bigcodebench_hard_instruct_pass@1: 100
+    bigcodebench_hard_complete_pass@1: 100
+    teval_naive_average: 100
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.27
+    mmlu-stem_accuracy: 77.75
+    mmlu-social-science_accuracy: 78.65
+    mmlu-humanities_accuracy: 73.12
+    mmlu-other_accuracy: 75.05
+    cmmlu_accuracy: 75.9
+    cmmlu-stem_accuracy: 73.41
+    cmmlu-social-science_accuracy: 75.97
+    cmmlu-humanities_accuracy: 76.42
+    cmmlu-other_accuracy: 78.15
+    cmmlu-china-specific_accuracy: 73.27
+    mmlu_pro_accuracy: 56.14
+    mmlu_pro_biology_accuracy: 72.25
+    mmlu_pro_business_accuracy: 66.16
+    mmlu_pro_chemistry_accuracy: 55.65
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 41.38
+    mmlu_pro_health_accuracy: 54.89
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 29.06
+    mmlu_pro_math_accuracy: 73.58
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 60.05
+    mmlu_pro_psychology_accuracy: 61.9
+    mmlu_pro_other_accuracy: 52.6
+    humanevalx-python_pass@1: 51.83
+    humanevalx-cpp_pass@1: 42.68
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 73.78
+    humanevalx-js_pass@1: 72.56
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.64
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 8.7
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 61.29
+    mmmlu_lite_accuracy: 54.7
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.18
+    openai_mmmlu_lite_DE-DE_accuracy: 60
+    openai_mmmlu_lite_ES-LA_accuracy: 66.18
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 48.63
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.26
+    openai_mmmlu_lite_JA-JP_accuracy: 60.7
+    openai_mmmlu_lite_KO-KR_accuracy: 60.63
+    openai_mmmlu_lite_PT-BR_accuracy: 54.46
+    openai_mmmlu_lite_SW-KE_accuracy: 36
+    openai_mmmlu_lite_YO-NG_accuracy: 31.86
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.4
+    college_naive_average: 48.33
+    high_naive_average: 59.33
+    middle_naive_average: 76.67
+    primary_naive_average: 86.67
+    arithmetic_naive_average: 74.33
+    mathbench-a (average)_naive_average: 69.07
+    college_knowledge_naive_average: 83.54
+    high_knowledge_naive_average: 80.82
+    middle_knowledge_naive_average: 83.79
+    primary_knowledge_naive_average: 92.22
+    mathbench-t (average)_naive_average: 85.1
+
+
+internlm3-8b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 89.22
+    ARC-c_accuracy: 92.54
+    BoolQ_accuracy: 86.45
+    triviaqa_wiki_1shot_score: 60.72
+    nq_open_1shot_score: 20.25
+    mmmlu_lite_naive_average: 41.82
+    IFEval_Prompt-level-strict-accuracy: 77.45
+    drop_accuracy: 83.27
+    bbh_naive_average: 55.22
+    GPQA_diamond_accuracy: 37.88
+    hellaswag_accuracy: 91.28
+    TheoremQA_score: 20.12
+    musr_average_naive_average: 36.86
+    korbench_single_naive_average: 41.2
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 91.28
+    GaokaoBench_weighted_average: 86.59
+    math_accuracy: 76.96
+    cmo_fib_accuracy: 35.1
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 78.96
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.45
+    cmmlu_naive_average: 83.33
+    mmlu_naive_average: 76.21
+    mmlu_pro_naive_average: 57.96
+    openai_humaneval_humaneval_pass@1: 81.71
+    sanitized_mbpp_score: 69.65
+    humanevalx_naive_average: 40.73
+    ds1000_naive_average: 27.23
+    lcb_code_generation_pass@1: 34.75
+    lcb_code_execution_pass@1: 49.9
+    lcb_test_output_pass@1: 48.19
+    bigcodebench_hard_instruct_pass@1: 100
+    bigcodebench_hard_complete_pass@1: 100
+    teval_naive_average: 100
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.21
+    mmlu-stem_accuracy: 77.7
+    mmlu-social-science_accuracy: 80.98
+    mmlu-humanities_accuracy: 70.83
+    mmlu-other_accuracy: 75.01
+    cmmlu_accuracy: 83.33
+    cmmlu-stem_accuracy: 79.66
+    cmmlu-social-science_accuracy: 83.39
+    cmmlu-humanities_accuracy: 84.73
+    cmmlu-other_accuracy: 86.2
+    cmmlu-china-specific_accuracy: 81.77
+    mmlu_pro_accuracy: 57.96
+    mmlu_pro_biology_accuracy: 75.45
+    mmlu_pro_business_accuracy: 64.64
+    mmlu_pro_chemistry_accuracy: 59.81
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 68.6
+    mmlu_pro_engineering_accuracy: 44.79
+    mmlu_pro_health_accuracy: 58.31
+    mmlu_pro_history_accuracy: 49.87
+    mmlu_pro_law_accuracy: 32.43
+    mmlu_pro_math_accuracy: 70.17
+    mmlu_pro_philosophy_accuracy: 46.89
+    mmlu_pro_physics_accuracy: 59.58
+    mmlu_pro_psychology_accuracy: 66.29
+    mmlu_pro_other_accuracy: 54.33
+    humanevalx-python_pass@1: 43.9
+    humanevalx-cpp_pass@1: 20.12
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 65.24
+    ds1000_Pandas_accuracy: 16.49
+    ds1000_Numpy_accuracy: 34.09
+    ds1000_Tensorflow_accuracy: 26.67
+    ds1000_Scipy_accuracy: 17.92
+    ds1000_Sklearn_accuracy: 20.87
+    ds1000_Pytorch_accuracy: 19.12
+    ds1000_Matplotlib_accuracy: 55.48
+    mmmlu_lite_accuracy: 41.82
+    openai_mmmlu_lite_AR-XY_accuracy: 32.56
+    openai_mmmlu_lite_BN-BD_accuracy: 4.56
+    openai_mmmlu_lite_DE-DE_accuracy: 24.91
+    openai_mmmlu_lite_ES-LA_accuracy: 51.09
+    openai_mmmlu_lite_FR-FR_accuracy: 61.68
+    openai_mmmlu_lite_HI-IN_accuracy: 24.98
+    openai_mmmlu_lite_ID-ID_accuracy: 44.56
+    openai_mmmlu_lite_IT-IT_accuracy: 52.35
+    openai_mmmlu_lite_JA-JP_accuracy: 51.02
+    openai_mmmlu_lite_KO-KR_accuracy: 47.93
+    openai_mmmlu_lite_PT-BR_accuracy: 53.89
+    openai_mmmlu_lite_SW-KE_accuracy: 33.47
+    openai_mmmlu_lite_YO-NG_accuracy: 33.47
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.05
+    college_naive_average: 45.67
+    high_naive_average: 64.67
+    middle_naive_average: 82.33
+    primary_naive_average: 90.33
+    arithmetic_naive_average: 74
+    mathbench-a (average)_naive_average: 71.4
+    college_knowledge_naive_average: 85.28
+    high_knowledge_naive_average: 79.43
+    middle_knowledge_naive_average: 87.9
+    primary_knowledge_naive_average: 93.42
+    mathbench-t (average)_naive_average: 86.51
+
+
+internlm3-8b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 89.02
+    ARC-c_accuracy: 93.56
+    BoolQ_accuracy: 86.67
+    triviaqa_wiki_1shot_score: 60.54
+    nq_open_1shot_score: 20.3
+    mmmlu_lite_naive_average: 42.6
+    IFEval_Prompt-level-strict-accuracy: 79.11
+    drop_accuracy: 83.32
+    bbh_naive_average: 54.76
+    GPQA_diamond_accuracy: 42.42
+    hellaswag_accuracy: 91.31
+    TheoremQA_score: 18
+    musr_average_naive_average: 36.62
+    korbench_single_naive_average: 41.84
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 90.67
+    GaokaoBench_weighted_average: 86.27
+    math_accuracy: 76.68
+    cmo_fib_accuracy: 33.65
+    aime2024_accuracy: 10
+    Mathbench_naive_average: 78.92
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.35
+    cmmlu_naive_average: 83.11
+    mmlu_naive_average: 76.23
+    mmlu_pro_naive_average: 58.16
+    openai_humaneval_humaneval_pass@1: 82.32
+    sanitized_mbpp_score: 70.04
+    humanevalx_naive_average: 39.76
+    ds1000_naive_average: 27.84
+    lcb_code_generation_pass@1: 34.5
+    lcb_code_execution_pass@1: 48.02
+    lcb_test_output_pass@1: 47.74
+    bigcodebench_hard_instruct_pass@1: 100
+    bigcodebench_hard_complete_pass@1: 100
+    teval_naive_average: 100
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.23
+    mmlu-stem_accuracy: 78.08
+    mmlu-social-science_accuracy: 80.31
+    mmlu-humanities_accuracy: 71.38
+    mmlu-other_accuracy: 74.63
+    cmmlu_accuracy: 83.11
+    cmmlu-stem_accuracy: 79.42
+    cmmlu-social-science_accuracy: 83.34
+    cmmlu-humanities_accuracy: 83.95
+    cmmlu-other_accuracy: 86.22
+    cmmlu-china-specific_accuracy: 81.5
+    mmlu_pro_accuracy: 58.16
+    mmlu_pro_biology_accuracy: 74.62
+    mmlu_pro_business_accuracy: 65.02
+    mmlu_pro_chemistry_accuracy: 60.69
+    mmlu_pro_computer_science_accuracy: 61.46
+    mmlu_pro_economics_accuracy: 68.25
+    mmlu_pro_engineering_accuracy: 45.3
+    mmlu_pro_health_accuracy: 60.15
+    mmlu_pro_history_accuracy: 50.66
+    mmlu_pro_law_accuracy: 31.7
+    mmlu_pro_math_accuracy: 70.32
+    mmlu_pro_philosophy_accuracy: 47.7
+    mmlu_pro_physics_accuracy: 59.51
+    mmlu_pro_psychology_accuracy: 65.41
+    mmlu_pro_other_accuracy: 53.46
+    humanevalx-python_pass@1: 42.68
+    humanevalx-cpp_pass@1: 19.51
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 72.56
+    humanevalx-js_pass@1: 64.02
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 35
+    ds1000_Tensorflow_accuracy: 24.44
+    ds1000_Scipy_accuracy: 20.75
+    ds1000_Sklearn_accuracy: 21.74
+    ds1000_Pytorch_accuracy: 22.06
+    ds1000_Matplotlib_accuracy: 56.77
+    mmmlu_lite_accuracy: 42.6
+    openai_mmmlu_lite_AR-XY_accuracy: 32.84
+    openai_mmmlu_lite_BN-BD_accuracy: 10.46
+    openai_mmmlu_lite_DE-DE_accuracy: 24.56
+    openai_mmmlu_lite_ES-LA_accuracy: 50.95
+    openai_mmmlu_lite_FR-FR_accuracy: 61.05
+    openai_mmmlu_lite_HI-IN_accuracy: 30.6
+    openai_mmmlu_lite_ID-ID_accuracy: 45.89
+    openai_mmmlu_lite_IT-IT_accuracy: 51.79
+    openai_mmmlu_lite_JA-JP_accuracy: 51.65
+    openai_mmmlu_lite_KO-KR_accuracy: 48.77
+    openai_mmmlu_lite_PT-BR_accuracy: 52.7
+    openai_mmmlu_lite_SW-KE_accuracy: 32.91
+    openai_mmmlu_lite_YO-NG_accuracy: 32.84
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.33
+    college_naive_average: 47
+    high_naive_average: 66.67
+    middle_naive_average: 81.67
+    primary_naive_average: 89.33
+    arithmetic_naive_average: 73.67
+    mathbench-a (average)_naive_average: 71.67
+    college_knowledge_naive_average: 82.91
+    high_knowledge_naive_average: 79.86
+    middle_knowledge_naive_average: 88.92
+    primary_knowledge_naive_average: 92.96
+    mathbench-t (average)_naive_average: 86.16
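All 527 added lines are new baseline entries: one top-level key per model, each holding an `objective:` mapping whose keys match the metric names the fullbench run emits. A hedged sketch of consuming such a file with plain PyYAML (the fixture name `baseline_scores_fullbench` in the test above suggests CI loads it roughly like this, but the actual loader is not part of this diff):

import yaml  # PyYAML

with open('.github/scripts/oc_score_baseline_fullbench.yaml') as f:
    baselines = yaml.safe_load(f)

# Look up one of the entries added by this commit.
score = baselines['qwen2.5-7b-instruct-turbomind']['objective']['gsm8k_accuracy']
assert abs(score - 92.57) < 1e-6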
.github/scripts/oc_score_baseline_testrange.yaml (vendored, 108 lines changed)
@@ -1,21 +1,24 @@
 chat:
   glm-4-9b-chat-hf:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 90.62
+    gsm8k_accuracy: 56.25
+    race-high_accuracy: 84.38
   glm-4-9b-chat-turbomind:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 90.62
   glm-4-9b-chat-vllm:
-    gsm8k_accuracy: 71.88
+    gsm8k_accuracy: 68.75
     race-high_accuracy: 90.62
   deepseek-7b-chat-hf:
     gsm8k_accuracy: 46.88
     race-high_accuracy: 81.25
-  deepseek-moe-16b-chat-hf:
-    gsm8k_accuracy: 50
-    race-high_accuracy: 68.75
+  deepseek-r1-distill-llama-8b-turbomind:
+    gsm8k_accuracy: 31.25
+    race-high_accuracy: 81.25
+  deepseek-r1-distill-qwen-1_5b-turbomind:
+    gsm8k_accuracy: 37.5
+    race-high_accuracy: 53.12
   deepseek-7b-chat-vllm:
-    gsm8k_accuracy: 50
+    gsm8k_accuracy: 43.75
     race-high_accuracy: 78.12
   gemma2-2b-it-hf:
     gsm8k_accuracy: 50
@@ -36,34 +39,40 @@ chat:
     gsm8k_accuracy: 78.12
     race-high_accuracy: 93.75
   gemma-7b-it-vllm:
-    gsm8k_accuracy: 46.88
+    gsm8k_accuracy: 31.25
     race-high_accuracy: 68.75
   internlm2_5-7b-chat-hf:
     gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
+  internlm3-8b-instruct-hf:
+    gsm8k_accuracy: 65.62
+    race-high_accuracy: 87.5
   internlm2_5-7b-chat-turbomind:
-    gsm8k_accuracy: 87.50
+    gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
   internlm2-chat-1.8b-turbomind:
     gsm8k_accuracy: 28.12
     race-high_accuracy: 84.38
   internlm2-chat-1.8b-sft-turbomind:
-    gsm8k_accuracy: 21.88
+    gsm8k_accuracy: 31.25
     race-high_accuracy: 84.38
   internlm2-chat-7b-lmdeploy:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 59.38
     race-high_accuracy: 84.38
   internlm2-chat-7b-sft-turbomind:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 90.62
+  internlm3-8b-instruct-turbomind:
+    gsm8k_accuracy: 68.75
+    race-high_accuracy: 87.5
   internlm2-chat-7b-vllm:
-    gsm8k_accuracy: 43.75
-    race-high_accuracy: 84.38
+    gsm8k_accuracy: 59.38
+    race-high_accuracy: 87.50
   llama-3_1-8b-instruct-hf:
     gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-hf:
-    gsm8k_accuracy: 68.75
+    gsm8k_accuracy: 71.88
     race-high_accuracy: 81.25
   llama-3-8b-instruct-hf:
     gsm8k_accuracy: 68.75
@@ -72,14 +81,17 @@ chat:
     gsm8k_accuracy: 18.75
     race-high_accuracy: 46.88
   llama-3_1-8b-instruct-turbomind:
-    gsm8k_accuracy: 78.12
+    gsm8k_accuracy: 81.25
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-turbomind:
-    gsm8k_accuracy: 65.62
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 81.25
   llama-3-8b-instruct-turbomind:
-    gsm8k_accuracy: 71.88
-    race-high_accuracy: 87.5
+    gsm8k_accuracy: 68.75
+    race-high_accuracy: 84.38
+  internvl2_5-8b-turbomind:
+    gsm8k_accuracy: 0
+    race-high_accuracy: 0
   mistral-7b-instruct-v0.2-hf:
     gsm8k_accuracy: 40.62
     race-high_accuracy: 75
@@ -94,13 +106,10 @@ chat:
     race-high_accuracy: 78.12
   mistral-7b-instruct-v0.1-vllm:
     gsm8k_accuracy: 34.38
-    race-high_accuracy: 68.75
+    race-high_accuracy: 65.62
   mistral-7b-instruct-v0.2-vllm:
-    gsm8k_accuracy: 31.25
-    race-high_accuracy: 75
-  phi-3-mini-4k-instruct-hf:
-    gsm8k_accuracy: 81.25
-    race-high_accuracy: 87.50
+    gsm8k_accuracy: 21.88
+    race-high_accuracy: 78.12
   qwen2.5-0.5b-instruct-hf:
     gsm8k_accuracy: 34.38
     race-high_accuracy: 46.88
@@ -108,10 +117,10 @@ chat:
     gsm8k_accuracy: 53.12
     race-high_accuracy: 90.62
   qwen2.5-0.5b-instruct-turbomind:
-    gsm8k_accuracy: 28.12
-    race-high_accuracy: 50
+    gsm8k_accuracy: 31.25
+    race-high_accuracy: 43.75
   qwen2.5-3b-instruct-turbomind:
-    gsm8k_accuracy: 59.38
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 90.62
   qwen1.5-0.5b-chat-hf:
     gsm8k_accuracy: 0
@@ -123,11 +132,11 @@ chat:
     gsm8k_accuracy: 68.75
     race-high_accuracy: 90.62
   qwen2-1.5b-instruct-turbomind:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 84.38
   qwen2-7b-instruct-turbomind:
     gsm8k_accuracy: 81.25
-    race-high_accuracy: 90.62
+    race-high_accuracy: 87.50
   qwen1.5-0.5b-chat-vllm:
     gsm8k_accuracy: 3.12
     race-high_accuracy: 53.12
@@ -143,11 +152,11 @@ chat:
   yi-1.5-9b-chat-turbomind:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 93.75
-  deepseek-v2-lite-chat-hf:
-    gsm8k_accuracy: 46.88
+  deepseek-v2_lite-chat-turbomind:
+    gsm8k_accuracy: 37.5
     race-high_accuracy: 71.88
   gemma2-27b-it-hf:
-    gsm8k_accuracy: 75
+    gsm8k_accuracy: 71.88
     race-high_accuracy: 93.75
   internlm2_5-20b-chat-hf:
     gsm8k_accuracy: 84.38
@@ -161,6 +170,9 @@ chat:
   mistral-small-instruct-2409-turbomind:
     gsm8k_accuracy: 81.25
     race-high_accuracy: 87.50
+  phi-4:
+    gsm8k_accuracy: 81.25
+    race-high_accuracy: 87.50
   qwen2.5-14b-instruct-hf:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 96.88
@@ -168,11 +180,11 @@ chat:
     gsm8k_accuracy: 68.75
     race-high_accuracy: 93.75
   yi-1.5-34b-chat-turbomind:
-    gsm8k_accuracy: 78.12
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 93.75
-  deepseek-67b-chat-hf:
-    gsm8k_accuracy: 71.88
-    race-high_accuracy: 78.12
+  deepseek-r1-distill-qwen-32b-turbomind:
+    gsm8k_accuracy: 25
+    race-high_accuracy: 90.62
   llama-3_3-70b-instruct-turbomind:
     gsm8k_accuracy: 93.75
     race-high_accuracy: 87.5
@@ -180,20 +192,26 @@ chat:
     gsm8k_accuracy: 59.38
     race-high_accuracy: 81.25
   mixtral-large-instruct-2411-turbomind:
-    gsm8k_accuracy: 90.62
+    gsm8k_accuracy: 87.50
     race-high_accuracy: 93.75
   nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
-    gsm8k_accuracy: 87.5
-    race-high_accuracy: 46.88
+    gsm8k_accuracy: 93.75
+    race-high_accuracy: 50.00
   qwen2.5-72b-instruct-turbomind:
-    gsm8k_accuracy: 75
-    race-high_accuracy: 93.75
+    gsm8k_accuracy: 81.25
+    race-high_accuracy: 90.62
+  deepseek-r1-distill-llama-70b-turbomind:
+    gsm8k_accuracy: 40.62
+    race-high_accuracy: 90.62
   deepseek-v2_5-1210-turbomind:
     gsm8k_accuracy: 90.62
     race-high_accuracy: 84.38
-  mixtral-8x22b-instruct-v0.1-hf:
-    gsm8k_accuracy: 81.25
-    race-high_accuracy: 81.25
+  mixtral-8x22b-instruct-v0.1-turbomind:
+    gsm8k_accuracy: 75
+    race-high_accuracy: 78.12
+  mixtral-8x22b-instruct-v0.1-vllm:
+    gsm8k_accuracy: 78.12
+    race-high_accuracy: 78.12
 base:
   glm-4-9b-hf:
     gsm8k_accuracy: 68.75
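Every accuracy in the testrange file is a multiple of 1/32 (56.25 = 18/32, 84.38 is 27/32 rounded, 90.62 is 29/32 rounded), which is consistent with these being smoke tests over 32 samples per dataset, so updated baselines move in roughly 3.12-point steps. A quick check of that inference:

# Hedged inference: testrange accuracies look like k/32 rounded to 2 decimals,
# i.e. a 32-sample smoke test. Verify for a few values from this diff.
for v in [56.25, 84.38, 90.62, 31.25, 3.12, 46.88, 65.62]:
    k = round(v * 32 / 100)  # nearest k out of 32
    assert abs(round(k / 32 * 100, 2) - v) < 0.005, v
print('all values are consistent with k/32 over 32 samples')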