diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 9f171a02..d88ff992 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -234,15 +234,15 @@ internlm2_5-7b-turbomind: sanitized_mbpp_score: 55.25 dingo_en_192_score: 60.94 dingo_zh_170_score: 67.65 - mmlu-stem_naive_average: 63.72 - mmlu-social-science_naive_average: 80.15 - mmlu-humanities_naive_average: 74.27 - mmlu-other_naive_average: 71.85 - cmmlu-stem_naive_average: 67.07 - cmmlu-social-science_naive_average: 81.49 - cmmlu-humanities_naive_average: 85.84 - cmmlu-other_naive_average: 82.69 - cmmlu-china-specific_naive_average: 79.88 + mmlu-stem_accuracy: 63.72 + mmlu-social-science_accuracy: 80.15 + mmlu-humanities_accuracy: 74.27 + mmlu-other_accuracy: 71.85 + cmmlu-stem_accuracy: 67.07 + cmmlu-social-science_accuracy: 81.49 + cmmlu-humanities_accuracy: 85.84 + cmmlu-other_accuracy: 82.69 + cmmlu-china-specific_accuracy: 79.88 mmlu_pro_biology_accuracy: 58.58 mmlu_pro_business_accuracy: 28.01 mmlu_pro_chemistry_accuracy: 22.79 @@ -281,12 +281,12 @@ internlm2_5-7b-turbomind: longbench_naive_average: 46.19 longbench_zh_naive_average: 49.3 longbench_en_naive_average: 43.97 - longbench_single-document-qa_naive_average: 42.84 - longbench_multi-document-qa_naive_average: 37.29 - longbench_summarization_naive_average: 23.21 - longbench_few-shot-learning_naive_average: 61.67 - longbench_synthetic-tasks_naive_average: 60.05 - longbench_code-completion_naive_average: 52.09 + longbench_single-document-qa_score: 42.84 + longbench_multi-document-qa_score: 37.29 + longbench_summarization_score: 23.21 + longbench_few-shot-learning_score: 61.67 + longbench_synthetic-tasks_score: 60.05 + longbench_code-completion_score: 52.09 internlm2_5-7b-chat-turbomind: objective: @@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind: teval_naive_average: 80 SciCode_sub_accuracy: 5.56 qa_dingo_cn_score: 99.01 - mmlu-stem_naive_average: 68.2 - mmlu-social-science_naive_average: 75.8 - mmlu-humanities_naive_average: 69.3 - mmlu-other_naive_average: 71.3 - cmmlu-stem_naive_average: 66.64 - cmmlu-social-science_naive_average: 76 - cmmlu-humanities_naive_average: 77.9 - cmmlu-other_naive_average: 77.25 - cmmlu-china-specific_naive_average: 73.6 + mmlu-stem_accuracy: 68.2 + mmlu-social-science_accuracy: 75.8 + mmlu-humanities_accuracy: 69.3 + mmlu-other_accuracy: 71.3 + cmmlu-stem_accuracy: 66.64 + cmmlu-social-science_accuracy: 76 + cmmlu-humanities_accuracy: 77.9 + cmmlu-other_accuracy: 77.25 + cmmlu-china-specific_accuracy: 73.6 mmlu_pro_biology_accuracy: 66.67 mmlu_pro_business_accuracy: 47.91 mmlu_pro_chemistry_accuracy: 35 @@ -448,9 +448,9 @@ internlm2_5-7b-chat-1m-turbomind: babilong_32k_naive_average: 48.9 babilong_128k_naive_average: 40.8 babilong_256k_naive_average: 23.5 - longbench_single-document-qa_naive_average: 43.56 - longbench_multi-document-qa_naive_average: 46.24 - longbench_summarization_naive_average: 24.32 - longbench_few-shot-learning_naive_average: 51.67 - longbench_synthetic-tasks_naive_average: 66.83 - longbench_code-completion_naive_average: 45.99 + longbench_single-document-qa_score: 43.56 + longbench_multi-document-qa_score: 46.24 + longbench_summarization_score: 24.32 + longbench_few-shot-learning_score: 51.67 + longbench_synthetic-tasks_score: 66.83 + longbench_code-completion_score: 45.99