update

2025-05-30 16:03:24 +08:00 · 2025-04-03 15:06:01 +08:00 · 2025-04-03 15:06:01 +08:00 · 9d63fdd616
commit 9d63fdd616
parent 7157b8911d
1 changed files with 13 additions and 13 deletions
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -490,8 +490,8 @@ qwen2.5-7b-instruct-turbomind:
        lcb_code_generation_pass@1: 39.5
        lcb_code_execution_pass@1: 42.38
        lcb_test_output_pass@1: 50.68
-        bigcodebench_hard_instruct_pass@1: 100
-        bigcodebench_hard_complete_pass@1: 100
+        bigcodebench_hard_instruct_pass@1: 16.22
+        bigcodebench_hard_complete_pass@1: 11.49
        teval_naive_average: 79.72
        SciCode_sub_accuracy: 100
        qa_dingo_cn_score: 99.01
@@ -598,8 +598,8 @@ internlm2_5-7b-chat-pytorch:
        lcb_code_execution_pass@1: 33.82
        lcb_test_output_pass@1: 22.62
        bigcodebench_hard_instruct_pass@1: 6.08
-        bigcodebench_hard_complete_pass@1: 100
-        teval_naive_average: 100
+        bigcodebench_hard_complete_pass@1: 6.76
+        teval_naive_average: 79.73
        SciCode_sub_accuracy: 100
        qa_dingo_cn_score: 100
        mmlu_accuracy: 70.2
@@ -702,9 +702,9 @@ qwen2.5-7b-instruct-pytorch:
        lcb_code_generation_pass@1: 38.75
        lcb_code_execution_pass@1: 42.38
        lcb_test_output_pass@1: 50.45
-        bigcodebench_hard_instruct_pass@1: 100
-        bigcodebench_hard_complete_pass@1: 100
-        teval_naive_average: 100
+        bigcodebench_hard_instruct_pass@1: 16.89
+        bigcodebench_hard_complete_pass@1: 12.16
+        teval_naive_average: 79.46
        SciCode_sub_accuracy: 100
        qa_dingo_cn_score: 100
        mmlu_accuracy: 76.27
@@ -807,9 +807,9 @@ internlm3-8b-instruct-turbomind:
        lcb_code_generation_pass@1: 34.75
        lcb_code_execution_pass@1: 49.9
        lcb_test_output_pass@1: 48.19
-        bigcodebench_hard_instruct_pass@1: 100
-        bigcodebench_hard_complete_pass@1: 100
-        teval_naive_average: 100
+        bigcodebench_hard_instruct_pass@1: 13.51
+        bigcodebench_hard_complete_pass@1: 15.54
+        teval_naive_average: 82.86
        SciCode_sub_accuracy: 100
        qa_dingo_cn_score: 100
        mmlu_accuracy: 76.21
@@ -912,9 +912,9 @@ internlm3-8b-instruct-pytorch:
        lcb_code_generation_pass@1: 34.5
        lcb_code_execution_pass@1: 48.02
        lcb_test_output_pass@1: 47.74
-        bigcodebench_hard_instruct_pass@1: 100
-        bigcodebench_hard_complete_pass@1: 100
-        teval_naive_average: 100
+        bigcodebench_hard_instruct_pass@1: 12.84
+        bigcodebench_hard_complete_pass@1: 15.54
+        teval_naive_average: 82.86
        SciCode_sub_accuracy: 100
        qa_dingo_cn_score: 100
        mmlu_accuracy: 76.23