diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 966aeed2..471c6602 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench: college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 0 - arenahard_score: 50 + alpaca_eval_total: 20.00 + arenahard_score: 56.82 Followbench_naive_average: 1 CompassArena_naive_average: 43 - mtbench101_avg: 7.8 - wildbench_average: -15.56 - simpleqa_accuracy_given_attempted: 0 - chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 8.00 + mtbench101_avg: 7.60 + wildbench_average: -14.58 + simpleqa_accuracy_given_attempted: 1.00 + chinese_simpleqa_given_attempted_accuracy: 0.90 + alignment_bench_v1_1_专业能力: 7.90 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -55,11 +55,11 @@ internlm2_5-7b-chat-hf_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 0 + alpaca_eval_helpful_base: 20.00 compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 55 + compassarena_knowledge_naive_average: 60.00 compassarena_reason_v2_naive_average: 40 - compassarena_math_v2_naive_average: 55 + compassarena_math_v2_naive_average: 50.00 compassarena_creationv2_zh_naive_average: 30 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 @@ -73,7 +73,7 @@ internlm2_5-7b-chat-hf_fullbench: followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 - simpleqa_f1: 0 + simpleqa_f1: 0.12 internlm2_5-7b-chat-turbomind_fullbench: objective: @@ -115,16 +115,16 @@ internlm2_5-7b-chat-turbomind_fullbench: college_naive_average: 12.50 college_knowledge_naive_average: 87.5 subjective: - alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 0 - arenahard_score: 50 + alignment_bench_v1_1_总分: 0.72 + alpaca_eval_total: 20.00 + arenahard_score: 55.77 Followbench_naive_average: 1 - CompassArena_naive_average: 40 - mtbench101_avg: 8 - wildbench_average: -6.81 - simpleqa_accuracy_given_attempted: 0 + CompassArena_naive_average: 39.00 + mtbench101_avg: 7.90 + wildbench_average: 0.00 + simpleqa_accuracy_given_attempted: 1.00 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 7.9 + alignment_bench_v1_1_专业能力: 8.70 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -132,12 +132,12 @@ internlm2_5-7b-chat-turbomind_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 0 - compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 45 - compassarena_reason_v2_naive_average: 25 - compassarena_math_v2_naive_average: 60 - compassarena_creationv2_zh_naive_average: 35 + alpaca_eval_helpful_base: 20.00 + compassarena_language_naive_average: 25.00 + compassarena_knowledge_naive_average: 55.00 + compassarena_reason_v2_naive_average: 35.00 + compassarena_math_v2_naive_average: 55.00 + compassarena_creationv2_zh_naive_average: 25.00 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 @@ -150,7 +150,7 @@ internlm2_5-7b-chat-turbomind_fullbench: followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 - simpleqa_f1: 0 + simpleqa_f1: 0.12 internlm2_5-7b-hf_fullbench: objective: