mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
update
This commit is contained in:
parent
f71eb78c72
commit
b42b83ac60
60
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
60
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
|
|||||||
sanitized_mbpp_score: 55.25
|
sanitized_mbpp_score: 55.25
|
||||||
dingo_en_192_score: 60.94
|
dingo_en_192_score: 60.94
|
||||||
dingo_zh_170_score: 67.65
|
dingo_zh_170_score: 67.65
|
||||||
mmlu-stem_naive_average: 63.72
|
mmlu-stem_accuracy: 63.72
|
||||||
mmlu-social-science_naive_average: 80.15
|
mmlu-social-science_accuracy: 80.15
|
||||||
mmlu-humanities_naive_average: 74.27
|
mmlu-humanities_accuracy: 74.27
|
||||||
mmlu-other_naive_average: 71.85
|
mmlu-other_accuracy: 71.85
|
||||||
cmmlu-stem_naive_average: 67.07
|
cmmlu-stem_accuracy: 67.07
|
||||||
cmmlu-social-science_naive_average: 81.49
|
cmmlu-social-science_accuracy: 81.49
|
||||||
cmmlu-humanities_naive_average: 85.84
|
cmmlu-humanities_accuracy: 85.84
|
||||||
cmmlu-other_naive_average: 82.69
|
cmmlu-other_accuracy: 82.69
|
||||||
cmmlu-china-specific_naive_average: 79.88
|
cmmlu-china-specific_accuracy: 79.88
|
||||||
mmlu_pro_biology_accuracy: 58.58
|
mmlu_pro_biology_accuracy: 58.58
|
||||||
mmlu_pro_business_accuracy: 28.01
|
mmlu_pro_business_accuracy: 28.01
|
||||||
mmlu_pro_chemistry_accuracy: 22.79
|
mmlu_pro_chemistry_accuracy: 22.79
|
||||||
@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
|
|||||||
longbench_naive_average: 46.19
|
longbench_naive_average: 46.19
|
||||||
longbench_zh_naive_average: 49.3
|
longbench_zh_naive_average: 49.3
|
||||||
longbench_en_naive_average: 43.97
|
longbench_en_naive_average: 43.97
|
||||||
longbench_single-document-qa_naive_average: 42.84
|
longbench_single-document-qa_score: 42.84
|
||||||
longbench_multi-document-qa_naive_average: 37.29
|
longbench_multi-document-qa_score: 37.29
|
||||||
longbench_summarization_naive_average: 23.21
|
longbench_summarization_score: 23.21
|
||||||
longbench_few-shot-learning_naive_average: 61.67
|
longbench_few-shot-learning_score: 61.67
|
||||||
longbench_synthetic-tasks_naive_average: 60.05
|
longbench_synthetic-tasks_score: 60.05
|
||||||
longbench_code-completion_naive_average: 52.09
|
longbench_code-completion_score: 52.09
|
||||||
|
|
||||||
internlm2_5-7b-chat-turbomind:
|
internlm2_5-7b-chat-turbomind:
|
||||||
objective:
|
objective:
|
||||||
@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
|
|||||||
teval_naive_average: 80
|
teval_naive_average: 80
|
||||||
SciCode_sub_accuracy: 5.56
|
SciCode_sub_accuracy: 5.56
|
||||||
qa_dingo_cn_score: 99.01
|
qa_dingo_cn_score: 99.01
|
||||||
mmlu-stem_naive_average: 68.2
|
mmlu-stem_accuracy: 68.2
|
||||||
mmlu-social-science_naive_average: 75.8
|
mmlu-social-science_accuracy: 75.8
|
||||||
mmlu-humanities_naive_average: 69.3
|
mmlu-humanities_accuracy: 69.3
|
||||||
mmlu-other_naive_average: 71.3
|
mmlu-other_accuracy: 71.3
|
||||||
cmmlu-stem_naive_average: 66.64
|
cmmlu-stem_accuracy: 66.64
|
||||||
cmmlu-social-science_naive_average: 76
|
cmmlu-social-science_accuracy: 76
|
||||||
cmmlu-humanities_naive_average: 77.9
|
cmmlu-humanities_accuracy: 77.9
|
||||||
cmmlu-other_naive_average: 77.25
|
cmmlu-other_accuracy: 77.25
|
||||||
cmmlu-china-specific_naive_average: 73.6
|
cmmlu-china-specific_accuracy: 73.6
|
||||||
mmlu_pro_biology_accuracy: 66.67
|
mmlu_pro_biology_accuracy: 66.67
|
||||||
mmlu_pro_business_accuracy: 47.91
|
mmlu_pro_business_accuracy: 47.91
|
||||||
mmlu_pro_chemistry_accuracy: 35
|
mmlu_pro_chemistry_accuracy: 35
|
||||||
@ -448,9 +448,9 @@ internlm2_5-7b-chat-1m-turbomind:
|
|||||||
babilong_32k_naive_average: 48.9
|
babilong_32k_naive_average: 48.9
|
||||||
babilong_128k_naive_average: 40.8
|
babilong_128k_naive_average: 40.8
|
||||||
babilong_256k_naive_average: 23.5
|
babilong_256k_naive_average: 23.5
|
||||||
longbench_single-document-qa_naive_average: 43.56
|
longbench_single-document-qa_score: 43.56
|
||||||
longbench_multi-document-qa_naive_average: 46.24
|
longbench_multi-document-qa_score: 46.24
|
||||||
longbench_summarization_naive_average: 24.32
|
longbench_summarization_score: 24.32
|
||||||
longbench_few-shot-learning_naive_average: 51.67
|
longbench_few-shot-learning_score: 51.67
|
||||||
longbench_synthetic-tasks_naive_average: 66.83
|
longbench_synthetic-tasks_score: 66.83
|
||||||
longbench_code-completion_naive_average: 45.99
|
longbench_code-completion_score: 45.99
|
||||||
|
Loading…
Reference in New Issue
Block a user