This commit is contained in:
zhulinJulia24 2025-05-13 13:15:51 +08:00
parent 4641a2890f
commit 1cc85721cc

View File

@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
college_knowledge_naive_average: 87.5 college_knowledge_naive_average: 87.5
subjective: subjective:
alignment_bench_v1_1_总分: 0.66 alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 0 alpaca_eval_total: 20.00
arenahard_score: 50 arenahard_score: 56.82
Followbench_naive_average: 1 Followbench_naive_average: 1
CompassArena_naive_average: 43 CompassArena_naive_average: 43
mtbench101_avg: 7.8 mtbench101_avg: 7.60
wildbench_average: -15.56 wildbench_average: -14.58
simpleqa_accuracy_given_attempted: 0 simpleqa_accuracy_given_attempted: 1.00
chinese_simpleqa_given_attempted_accuracy: 1 chinese_simpleqa_given_attempted_accuracy: 0.90
alignment_bench_v1_1_专业能力: 8.00 alignment_bench_v1_1_专业能力: 7.90
alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0 alignment_bench_v1_1_逻辑推理: 0
@ -55,11 +55,11 @@ internlm2_5-7b-chat-hf_fullbench:
alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0 alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 0 alpaca_eval_helpful_base: 20.00
compassarena_language_naive_average: 35 compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 55 compassarena_knowledge_naive_average: 60.00
compassarena_reason_v2_naive_average: 40 compassarena_reason_v2_naive_average: 40
compassarena_math_v2_naive_average: 55 compassarena_math_v2_naive_average: 50.00
compassarena_creationv2_zh_naive_average: 30 compassarena_creationv2_zh_naive_average: 30
followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1
@ -73,7 +73,7 @@ internlm2_5-7b-chat-hf_fullbench:
followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1 followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0 simpleqa_f1: 0.12
internlm2_5-7b-chat-turbomind_fullbench: internlm2_5-7b-chat-turbomind_fullbench:
objective: objective:
@ -115,16 +115,16 @@ internlm2_5-7b-chat-turbomind_fullbench:
college_naive_average: 12.50 college_naive_average: 12.50
college_knowledge_naive_average: 87.5 college_knowledge_naive_average: 87.5
subjective: subjective:
alignment_bench_v1_1_总分: 0.66 alignment_bench_v1_1_总分: 0.72
alpaca_eval_total: 0 alpaca_eval_total: 20.00
arenahard_score: 50 arenahard_score: 55.77
Followbench_naive_average: 1 Followbench_naive_average: 1
CompassArena_naive_average: 40 CompassArena_naive_average: 39.00
mtbench101_avg: 8 mtbench101_avg: 7.90
wildbench_average: -6.81 wildbench_average: 0.00
simpleqa_accuracy_given_attempted: 0 simpleqa_accuracy_given_attempted: 1.00
chinese_simpleqa_given_attempted_accuracy: 1 chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 7.9 alignment_bench_v1_1_专业能力: 8.70
alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0 alignment_bench_v1_1_逻辑推理: 0
@ -132,12 +132,12 @@ internlm2_5-7b-chat-turbomind_fullbench:
alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0 alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 0 alpaca_eval_helpful_base: 20.00
compassarena_language_naive_average: 35 compassarena_language_naive_average: 25.00
compassarena_knowledge_naive_average: 45 compassarena_knowledge_naive_average: 55.00
compassarena_reason_v2_naive_average: 25 compassarena_reason_v2_naive_average: 35.00
compassarena_math_v2_naive_average: 60 compassarena_math_v2_naive_average: 55.00
compassarena_creationv2_zh_naive_average: 35 compassarena_creationv2_zh_naive_average: 25.00
followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1 followbench_llmeval_en_HSR_L1: 1
@ -150,7 +150,7 @@ internlm2_5-7b-chat-turbomind_fullbench:
followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1 followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0 simpleqa_f1: 0.12
internlm2_5-7b-hf_fullbench: internlm2_5-7b-hf_fullbench:
objective: objective: