mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
update
This commit is contained in:
parent
4641a2890f
commit
1cc85721cc
52
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
52
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
college_knowledge_naive_average: 87.5
|
college_knowledge_naive_average: 87.5
|
||||||
subjective:
|
subjective:
|
||||||
alignment_bench_v1_1_总分: 0.66
|
alignment_bench_v1_1_总分: 0.66
|
||||||
alpaca_eval_total: 0
|
alpaca_eval_total: 20.00
|
||||||
arenahard_score: 50
|
arenahard_score: 56.82
|
||||||
Followbench_naive_average: 1
|
Followbench_naive_average: 1
|
||||||
CompassArena_naive_average: 43
|
CompassArena_naive_average: 43
|
||||||
mtbench101_avg: 7.8
|
mtbench101_avg: 7.60
|
||||||
wildbench_average: -15.56
|
wildbench_average: -14.58
|
||||||
simpleqa_accuracy_given_attempted: 0
|
simpleqa_accuracy_given_attempted: 1.00
|
||||||
chinese_simpleqa_given_attempted_accuracy: 1
|
chinese_simpleqa_given_attempted_accuracy: 0.90
|
||||||
alignment_bench_v1_1_专业能力: 8.00
|
alignment_bench_v1_1_专业能力: 7.90
|
||||||
alignment_bench_v1_1_数学计算: 0
|
alignment_bench_v1_1_数学计算: 0
|
||||||
alignment_bench_v1_1_基本任务: 0
|
alignment_bench_v1_1_基本任务: 0
|
||||||
alignment_bench_v1_1_逻辑推理: 0
|
alignment_bench_v1_1_逻辑推理: 0
|
||||||
@ -55,11 +55,11 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
alignment_bench_v1_1_文本写作: 0
|
alignment_bench_v1_1_文本写作: 0
|
||||||
alignment_bench_v1_1_角色扮演: 0
|
alignment_bench_v1_1_角色扮演: 0
|
||||||
alignment_bench_v1_1_综合问答: 0
|
alignment_bench_v1_1_综合问答: 0
|
||||||
alpaca_eval_helpful_base: 0
|
alpaca_eval_helpful_base: 20.00
|
||||||
compassarena_language_naive_average: 35
|
compassarena_language_naive_average: 35
|
||||||
compassarena_knowledge_naive_average: 55
|
compassarena_knowledge_naive_average: 60.00
|
||||||
compassarena_reason_v2_naive_average: 40
|
compassarena_reason_v2_naive_average: 40
|
||||||
compassarena_math_v2_naive_average: 55
|
compassarena_math_v2_naive_average: 50.00
|
||||||
compassarena_creationv2_zh_naive_average: 30
|
compassarena_creationv2_zh_naive_average: 30
|
||||||
followbench_llmeval_en_HSR_AVG: 1
|
followbench_llmeval_en_HSR_AVG: 1
|
||||||
followbench_llmeval_en_SSR_AVG: 1
|
followbench_llmeval_en_SSR_AVG: 1
|
||||||
@ -73,7 +73,7 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
followbench_llmeval_en_SSR_L3: 1
|
followbench_llmeval_en_SSR_L3: 1
|
||||||
followbench_llmeval_en_SSR_L4: 1
|
followbench_llmeval_en_SSR_L4: 1
|
||||||
followbench_llmeval_en_SSR_L5: 1
|
followbench_llmeval_en_SSR_L5: 1
|
||||||
simpleqa_f1: 0
|
simpleqa_f1: 0.12
|
||||||
|
|
||||||
internlm2_5-7b-chat-turbomind_fullbench:
|
internlm2_5-7b-chat-turbomind_fullbench:
|
||||||
objective:
|
objective:
|
||||||
@ -115,16 +115,16 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
|||||||
college_naive_average: 12.50
|
college_naive_average: 12.50
|
||||||
college_knowledge_naive_average: 87.5
|
college_knowledge_naive_average: 87.5
|
||||||
subjective:
|
subjective:
|
||||||
alignment_bench_v1_1_总分: 0.66
|
alignment_bench_v1_1_总分: 0.72
|
||||||
alpaca_eval_total: 0
|
alpaca_eval_total: 20.00
|
||||||
arenahard_score: 50
|
arenahard_score: 55.77
|
||||||
Followbench_naive_average: 1
|
Followbench_naive_average: 1
|
||||||
CompassArena_naive_average: 40
|
CompassArena_naive_average: 39.00
|
||||||
mtbench101_avg: 8
|
mtbench101_avg: 7.90
|
||||||
wildbench_average: -6.81
|
wildbench_average: 0.00
|
||||||
simpleqa_accuracy_given_attempted: 0
|
simpleqa_accuracy_given_attempted: 1.00
|
||||||
chinese_simpleqa_given_attempted_accuracy: 1
|
chinese_simpleqa_given_attempted_accuracy: 1
|
||||||
alignment_bench_v1_1_专业能力: 7.9
|
alignment_bench_v1_1_专业能力: 8.70
|
||||||
alignment_bench_v1_1_数学计算: 0
|
alignment_bench_v1_1_数学计算: 0
|
||||||
alignment_bench_v1_1_基本任务: 0
|
alignment_bench_v1_1_基本任务: 0
|
||||||
alignment_bench_v1_1_逻辑推理: 0
|
alignment_bench_v1_1_逻辑推理: 0
|
||||||
@ -132,12 +132,12 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
|||||||
alignment_bench_v1_1_文本写作: 0
|
alignment_bench_v1_1_文本写作: 0
|
||||||
alignment_bench_v1_1_角色扮演: 0
|
alignment_bench_v1_1_角色扮演: 0
|
||||||
alignment_bench_v1_1_综合问答: 0
|
alignment_bench_v1_1_综合问答: 0
|
||||||
alpaca_eval_helpful_base: 0
|
alpaca_eval_helpful_base: 20.00
|
||||||
compassarena_language_naive_average: 35
|
compassarena_language_naive_average: 25.00
|
||||||
compassarena_knowledge_naive_average: 45
|
compassarena_knowledge_naive_average: 55.00
|
||||||
compassarena_reason_v2_naive_average: 25
|
compassarena_reason_v2_naive_average: 35.00
|
||||||
compassarena_math_v2_naive_average: 60
|
compassarena_math_v2_naive_average: 55.00
|
||||||
compassarena_creationv2_zh_naive_average: 35
|
compassarena_creationv2_zh_naive_average: 25.00
|
||||||
followbench_llmeval_en_HSR_AVG: 1
|
followbench_llmeval_en_HSR_AVG: 1
|
||||||
followbench_llmeval_en_SSR_AVG: 1
|
followbench_llmeval_en_SSR_AVG: 1
|
||||||
followbench_llmeval_en_HSR_L1: 1
|
followbench_llmeval_en_HSR_L1: 1
|
||||||
@ -150,7 +150,7 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
|||||||
followbench_llmeval_en_SSR_L3: 1
|
followbench_llmeval_en_SSR_L3: 1
|
||||||
followbench_llmeval_en_SSR_L4: 1
|
followbench_llmeval_en_SSR_L4: 1
|
||||||
followbench_llmeval_en_SSR_L5: 1
|
followbench_llmeval_en_SSR_L5: 1
|
||||||
simpleqa_f1: 0
|
simpleqa_f1: 0.12
|
||||||
|
|
||||||
internlm2_5-7b-hf_fullbench:
|
internlm2_5-7b-hf_fullbench:
|
||||||
objective:
|
objective:
|
||||||
|
Loading…
Reference in New Issue
Block a user