diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index b90daa5e..c0e735fb 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench: lcb_test_output_pass@1: 18.75 bbh-logical_deduction_seven_objects_score: 50 bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 72.6 - cmmlu-china-specific_naive_average: 76.25 + mmlu-other_accuracy: 72.6 + cmmlu-china-specific_accuracy: 76.25 mmlu_pro_math_accuracy: 25 ds1000_Pandas_accuracy: 12.5 ds1000_Numpy_accuracy: 0 @@ -101,8 +101,8 @@ internlm2_5-7b-chat-turbomind_fullbench: lcb_test_output_pass@1: 25.00 bbh-logical_deduction_seven_objects_score: 50.00 bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 69.71 - cmmlu-china-specific_naive_average: 75.83 + mmlu-other_accuracy: 69.71 + cmmlu-china-specific_accuracy: 75.83 mmlu_pro_math_accuracy: 31.25 ds1000_Pandas_accuracy: 0 ds1000_Numpy_accuracy: 0