mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[ci] update baseline for kernal change of vllm and lmdeploy (#2011)
* update * update * update * update * update * update * update
This commit is contained in:
parent
a05f9da134
commit
6ac9b06bc2
26
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
26
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
|
|||||||
drop_accuracy: 81.25
|
drop_accuracy: 81.25
|
||||||
GPQA_diamond_accuracy: 25
|
GPQA_diamond_accuracy: 25
|
||||||
hellaswag_accuracy: 87.5
|
hellaswag_accuracy: 87.5
|
||||||
TheoremQA_score: 18.75
|
TheoremQA_score: 12.50
|
||||||
musr_average_naive_average: 39.58
|
musr_average_naive_average: 39.58
|
||||||
korbench_single_naive_average: 40
|
korbench_single_naive_average: 40
|
||||||
gsm8k_accuracy: 62.50
|
gsm8k_accuracy: 62.50
|
||||||
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
|
|||||||
drop_accuracy: 62.5
|
drop_accuracy: 62.5
|
||||||
GPQA_diamond_accuracy: 62.5
|
GPQA_diamond_accuracy: 62.5
|
||||||
hellaswag_accuracy: 93.75
|
hellaswag_accuracy: 93.75
|
||||||
TheoremQA_score: 25
|
TheoremQA_score: 12.50
|
||||||
winogrande_accuracy: 75
|
winogrande_accuracy: 75
|
||||||
gsm8k_accuracy: 37.5
|
gsm8k_accuracy: 37.5
|
||||||
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
|
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
|
||||||
@ -190,7 +190,7 @@ internlm2_5-7b-turbomind_fullbench:
|
|||||||
drop_accuracy: 62.5
|
drop_accuracy: 62.5
|
||||||
GPQA_diamond_accuracy: 62.5
|
GPQA_diamond_accuracy: 62.5
|
||||||
hellaswag_accuracy: 93.75
|
hellaswag_accuracy: 93.75
|
||||||
TheoremQA_score: 31.25
|
TheoremQA_score: 12.50
|
||||||
winogrande_accuracy: 87.5
|
winogrande_accuracy: 87.5
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 56.25
|
||||||
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
|
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
|
||||||
@ -391,7 +391,7 @@ internlm2_5-7b-chat-turbomind:
|
|||||||
alpaca_eval_total: 25.96
|
alpaca_eval_total: 25.96
|
||||||
arenahard_score: 17.15
|
arenahard_score: 17.15
|
||||||
Followbench_naive_average: 0.81
|
Followbench_naive_average: 0.81
|
||||||
CompassArena_naive_average: 34.61
|
CompassArena_naive_average: 39.49
|
||||||
FoFo_naive_average: 0.38
|
FoFo_naive_average: 0.38
|
||||||
mtbench101_avg: 8.01
|
mtbench101_avg: 8.01
|
||||||
wildbench_average: -10.49
|
wildbench_average: -10.49
|
||||||
@ -410,10 +410,10 @@ internlm2_5-7b-chat-turbomind:
|
|||||||
alpaca_eval_oasst: 23.4
|
alpaca_eval_oasst: 23.4
|
||||||
alpaca_eval_selfinstruct: 30.95
|
alpaca_eval_selfinstruct: 30.95
|
||||||
alpaca_eval_vicuna: 33.75
|
alpaca_eval_vicuna: 33.75
|
||||||
compassarena_language_naive_average: 52.5
|
compassarena_language_naive_average: 58.50
|
||||||
compassarena_knowledge_naive_average: 36
|
compassarena_knowledge_naive_average: 36
|
||||||
compassarena_reason_v2_naive_average: 35
|
compassarena_reason_v2_naive_average: 35
|
||||||
compassarena_math_v2_naive_average: 19.91
|
compassarena_math_v2_naive_average: 25.95
|
||||||
compassarena_creationv2_zh_naive_average: 43.64
|
compassarena_creationv2_zh_naive_average: 43.64
|
||||||
fofo_test_prompts_overall: 0.35
|
fofo_test_prompts_overall: 0.35
|
||||||
fofo_test_prompts_cn_overall: 0.41
|
fofo_test_prompts_cn_overall: 0.41
|
||||||
@ -493,7 +493,7 @@ qwen2.5-7b-instruct-turbomind:
|
|||||||
bigcodebench_hard_instruct_pass@1: 16.22
|
bigcodebench_hard_instruct_pass@1: 16.22
|
||||||
bigcodebench_hard_complete_pass@1: 11.49
|
bigcodebench_hard_complete_pass@1: 11.49
|
||||||
teval_naive_average: 79.72
|
teval_naive_average: 79.72
|
||||||
SciCode_sub_accuracy: 100
|
SciCode_sub_accuracy: 10.76
|
||||||
qa_dingo_cn_score: 99.01
|
qa_dingo_cn_score: 99.01
|
||||||
mmlu_accuracy: 76.01
|
mmlu_accuracy: 76.01
|
||||||
mmlu-stem_accuracy: 77.59
|
mmlu-stem_accuracy: 77.59
|
||||||
@ -600,7 +600,7 @@ internlm2_5-7b-chat-pytorch:
|
|||||||
bigcodebench_hard_instruct_pass@1: 6.08
|
bigcodebench_hard_instruct_pass@1: 6.08
|
||||||
bigcodebench_hard_complete_pass@1: 6.76
|
bigcodebench_hard_complete_pass@1: 6.76
|
||||||
teval_naive_average: 79.73
|
teval_naive_average: 79.73
|
||||||
SciCode_sub_accuracy: 100
|
SciCode_sub_accuracy: 3.47
|
||||||
qa_dingo_cn_score: 100
|
qa_dingo_cn_score: 100
|
||||||
mmlu_accuracy: 70.2
|
mmlu_accuracy: 70.2
|
||||||
mmlu-stem_accuracy: 67.73
|
mmlu-stem_accuracy: 67.73
|
||||||
@ -689,7 +689,7 @@ qwen2.5-7b-instruct-pytorch:
|
|||||||
GaokaoBench_weighted_average: 80.02
|
GaokaoBench_weighted_average: 80.02
|
||||||
math_accuracy: 73.74
|
math_accuracy: 73.74
|
||||||
cmo_fib_accuracy: 26.44
|
cmo_fib_accuracy: 26.44
|
||||||
aime2024_accuracy: 10
|
aime2024_accuracy: 13.33
|
||||||
Mathbench_naive_average: 77.08
|
Mathbench_naive_average: 77.08
|
||||||
wikibench-wiki-single_choice_cncircular_perf_4: 34
|
wikibench-wiki-single_choice_cncircular_perf_4: 34
|
||||||
cmmlu_naive_average: 75.9
|
cmmlu_naive_average: 75.9
|
||||||
@ -705,7 +705,7 @@ qwen2.5-7b-instruct-pytorch:
|
|||||||
bigcodebench_hard_instruct_pass@1: 16.89
|
bigcodebench_hard_instruct_pass@1: 16.89
|
||||||
bigcodebench_hard_complete_pass@1: 12.16
|
bigcodebench_hard_complete_pass@1: 12.16
|
||||||
teval_naive_average: 79.46
|
teval_naive_average: 79.46
|
||||||
SciCode_sub_accuracy: 100
|
SciCode_sub_accuracy: 10.42
|
||||||
qa_dingo_cn_score: 100
|
qa_dingo_cn_score: 100
|
||||||
mmlu_accuracy: 76.27
|
mmlu_accuracy: 76.27
|
||||||
mmlu-stem_accuracy: 77.75
|
mmlu-stem_accuracy: 77.75
|
||||||
@ -810,7 +810,7 @@ internlm3-8b-instruct-turbomind:
|
|||||||
bigcodebench_hard_instruct_pass@1: 13.51
|
bigcodebench_hard_instruct_pass@1: 13.51
|
||||||
bigcodebench_hard_complete_pass@1: 15.54
|
bigcodebench_hard_complete_pass@1: 15.54
|
||||||
teval_naive_average: 82.86
|
teval_naive_average: 82.86
|
||||||
SciCode_sub_accuracy: 100
|
SciCode_sub_accuracy: 11.11
|
||||||
qa_dingo_cn_score: 100
|
qa_dingo_cn_score: 100
|
||||||
mmlu_accuracy: 76.21
|
mmlu_accuracy: 76.21
|
||||||
mmlu-stem_accuracy: 77.7
|
mmlu-stem_accuracy: 77.7
|
||||||
@ -889,7 +889,7 @@ internlm3-8b-instruct-pytorch:
|
|||||||
IFEval_Prompt-level-strict-accuracy: 79.11
|
IFEval_Prompt-level-strict-accuracy: 79.11
|
||||||
drop_accuracy: 83.32
|
drop_accuracy: 83.32
|
||||||
bbh_naive_average: 54.76
|
bbh_naive_average: 54.76
|
||||||
GPQA_diamond_accuracy: 42.42
|
GPQA_diamond_accuracy: 33.84
|
||||||
hellaswag_accuracy: 91.31
|
hellaswag_accuracy: 91.31
|
||||||
TheoremQA_score: 18
|
TheoremQA_score: 18
|
||||||
musr_average_naive_average: 36.62
|
musr_average_naive_average: 36.62
|
||||||
@ -915,7 +915,7 @@ internlm3-8b-instruct-pytorch:
|
|||||||
bigcodebench_hard_instruct_pass@1: 12.84
|
bigcodebench_hard_instruct_pass@1: 12.84
|
||||||
bigcodebench_hard_complete_pass@1: 15.54
|
bigcodebench_hard_complete_pass@1: 15.54
|
||||||
teval_naive_average: 82.86
|
teval_naive_average: 82.86
|
||||||
SciCode_sub_accuracy: 100
|
SciCode_sub_accuracy: 9.38
|
||||||
qa_dingo_cn_score: 100
|
qa_dingo_cn_score: 100
|
||||||
mmlu_accuracy: 76.23
|
mmlu_accuracy: 76.23
|
||||||
mmlu-stem_accuracy: 78.08
|
mmlu-stem_accuracy: 78.08
|
||||||
|
78
.github/scripts/oc_score_baseline_testrange.yaml
vendored
78
.github/scripts/oc_score_baseline_testrange.yaml
vendored
@ -6,7 +6,7 @@ chat:
|
|||||||
gsm8k_accuracy: 71.88
|
gsm8k_accuracy: 71.88
|
||||||
race-high_accuracy: 90.62
|
race-high_accuracy: 90.62
|
||||||
glm-4-9b-chat-vllm:
|
glm-4-9b-chat-vllm:
|
||||||
gsm8k_accuracy: 68.75
|
gsm8k_accuracy: 71.88
|
||||||
race-high_accuracy: 90.62
|
race-high_accuracy: 90.62
|
||||||
deepseek-7b-chat-hf:
|
deepseek-7b-chat-hf:
|
||||||
gsm8k_accuracy: 46.88
|
gsm8k_accuracy: 46.88
|
||||||
@ -84,7 +84,7 @@ chat:
|
|||||||
gsm8k_accuracy: 81.25
|
gsm8k_accuracy: 81.25
|
||||||
race-high_accuracy: 90.62
|
race-high_accuracy: 90.62
|
||||||
llama-3_2-3b-instruct-turbomind:
|
llama-3_2-3b-instruct-turbomind:
|
||||||
gsm8k_accuracy: 75.00
|
gsm8k_accuracy: 68.75
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 81.25
|
||||||
llama-3-8b-instruct-turbomind:
|
llama-3-8b-instruct-turbomind:
|
||||||
gsm8k_accuracy: 68.75
|
gsm8k_accuracy: 68.75
|
||||||
@ -204,14 +204,14 @@ chat:
|
|||||||
gsm8k_accuracy: 90.62
|
gsm8k_accuracy: 90.62
|
||||||
race-high_accuracy: 84.38
|
race-high_accuracy: 84.38
|
||||||
mixtral-8x22b-instruct-v0.1-turbomind:
|
mixtral-8x22b-instruct-v0.1-turbomind:
|
||||||
gsm8k_accuracy: 75
|
gsm8k_accuracy: 78.12
|
||||||
race-high_accuracy: 78.12
|
race-high_accuracy: 78.12
|
||||||
mixtral-8x22b-instruct-v0.1-vllm:
|
mixtral-8x22b-instruct-v0.1-vllm:
|
||||||
gsm8k_accuracy: 78.12
|
gsm8k_accuracy: 78.12
|
||||||
race-high_accuracy: 78.12
|
race-high_accuracy: 78.12
|
||||||
base:
|
base:
|
||||||
glm-4-9b-turbomind:
|
glm-4-9b-turbomind:
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 28.12
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
@ -253,8 +253,8 @@ base:
|
|||||||
gemma-2-9b-turbomind:
|
gemma-2-9b-turbomind:
|
||||||
gsm8k_accuracy: 68.75
|
gsm8k_accuracy: 68.75
|
||||||
GPQA_diamond_accuracy: 0
|
GPQA_diamond_accuracy: 0
|
||||||
race-high_accuracy: 78.12
|
race-high_accuracy: 18.75
|
||||||
winogrande_accuracy: 50
|
winogrande_accuracy: 46.88
|
||||||
gemma-2b-vllm:
|
gemma-2b-vllm:
|
||||||
gsm8k_accuracy: 15.62
|
gsm8k_accuracy: 15.62
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
@ -281,20 +281,20 @@ base:
|
|||||||
race-high_accuracy: 71.88
|
race-high_accuracy: 71.88
|
||||||
winogrande_accuracy: 75
|
winogrande_accuracy: 75
|
||||||
internlm2_5-7b-turbomind:
|
internlm2_5-7b-turbomind:
|
||||||
|
gsm8k_accuracy: 62.5
|
||||||
|
GPQA_diamond_accuracy: 31.25
|
||||||
|
race-high_accuracy: 93.75
|
||||||
|
winogrande_accuracy: 87.5
|
||||||
|
internlm2-7b-turbomind:
|
||||||
gsm8k_accuracy: 59.38
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 34.38
|
GPQA_diamond_accuracy: 34.38
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 78.12
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 71.88
|
||||||
internlm2-7b-turbomind:
|
|
||||||
gsm8k_accuracy: 50
|
|
||||||
GPQA_diamond_accuracy: 18.75
|
|
||||||
race-high_accuracy: 71.88
|
|
||||||
winogrande_accuracy: 84.38
|
|
||||||
internlm2-base-7b-turbomind:
|
internlm2-base-7b-turbomind:
|
||||||
gsm8k_accuracy: 37.50
|
gsm8k_accuracy: 28.12
|
||||||
GPQA_diamond_accuracy: 21.88
|
GPQA_diamond_accuracy: 31.25
|
||||||
race-high_accuracy: 84.38
|
race-high_accuracy: 71.88
|
||||||
winogrande_accuracy: 75
|
winogrande_accuracy: 62.50
|
||||||
llama-2-7b-hf:
|
llama-2-7b-hf:
|
||||||
gsm8k_accuracy: 21.88
|
gsm8k_accuracy: 21.88
|
||||||
GPQA_diamond_accuracy: 21.88
|
GPQA_diamond_accuracy: 21.88
|
||||||
@ -311,15 +311,15 @@ base:
|
|||||||
race-high_accuracy: 65.62
|
race-high_accuracy: 65.62
|
||||||
winogrande_accuracy: 65.62
|
winogrande_accuracy: 65.62
|
||||||
llama-3.1-8b-turbomind:
|
llama-3.1-8b-turbomind:
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 9.38
|
GPQA_diamond_accuracy: 15.62
|
||||||
race-high_accuracy: 78.12
|
race-high_accuracy: 78.12
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 78.12
|
||||||
llama-3-8b-turbomind:
|
llama-3-8b-turbomind:
|
||||||
gsm8k_accuracy: 46.88
|
gsm8k_accuracy: 46.88
|
||||||
GPQA_diamond_accuracy: 12.50
|
GPQA_diamond_accuracy: 12.50
|
||||||
race-high_accuracy: 65.62
|
race-high_accuracy: 65.62
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 81.25
|
||||||
mistral-7b-v0.3-hf:
|
mistral-7b-v0.3-hf:
|
||||||
gsm8k_accuracy: 31.25
|
gsm8k_accuracy: 31.25
|
||||||
GPQA_diamond_accuracy: 6.25
|
GPQA_diamond_accuracy: 6.25
|
||||||
@ -331,8 +331,8 @@ base:
|
|||||||
race-high_accuracy: 87.5
|
race-high_accuracy: 87.5
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
qwen2.5-1.5b-turbomind:
|
qwen2.5-1.5b-turbomind:
|
||||||
gsm8k_accuracy: 62.50
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 15.62
|
GPQA_diamond_accuracy: 18.75
|
||||||
race-high_accuracy: 75
|
race-high_accuracy: 75
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
qwen2.5-7b-turbomind:
|
qwen2.5-7b-turbomind:
|
||||||
@ -362,19 +362,19 @@ base:
|
|||||||
winogrande_accuracy: 68.75
|
winogrande_accuracy: 68.75
|
||||||
qwen2-1.5b-turbomind:
|
qwen2-1.5b-turbomind:
|
||||||
gsm8k_accuracy: 59.38
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 12.50
|
GPQA_diamond_accuracy: 6.25
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 81.25
|
||||||
winogrande_accuracy: 75
|
winogrande_accuracy: 75
|
||||||
qwen2-7b-turbomind:
|
qwen2-7b-turbomind:
|
||||||
gsm8k_accuracy: 65.62
|
gsm8k_accuracy: 62.5
|
||||||
GPQA_diamond_accuracy: 12.5
|
GPQA_diamond_accuracy: 12.5
|
||||||
race-high_accuracy: 87.5
|
race-high_accuracy: 87.5
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 75
|
||||||
qwen1.5-0.5b-vllm:
|
qwen1.5-0.5b-vllm:
|
||||||
gsm8k_accuracy: 6.25
|
gsm8k_accuracy: 9.38
|
||||||
GPQA_diamond_accuracy: 0
|
GPQA_diamond_accuracy: 0
|
||||||
race-high_accuracy: 56.25
|
race-high_accuracy: 56.25
|
||||||
winogrande_accuracy: 62.5
|
winogrande_accuracy: 59.38
|
||||||
yi-1.5-6b-hf:
|
yi-1.5-6b-hf:
|
||||||
gsm8k_accuracy: 62.5
|
gsm8k_accuracy: 62.5
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
@ -387,11 +387,11 @@ base:
|
|||||||
winogrande_accuracy: 59.38
|
winogrande_accuracy: 59.38
|
||||||
yi-1.5-9b-turbomind:
|
yi-1.5-9b-turbomind:
|
||||||
gsm8k_accuracy: 78.12
|
gsm8k_accuracy: 78.12
|
||||||
GPQA_diamond_accuracy: 43.75
|
GPQA_diamond_accuracy: 40.62
|
||||||
race-high_accuracy: 87.5
|
race-high_accuracy: 87.5
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 65.62
|
||||||
internlm2-20b-turbomind:
|
internlm2-20b-turbomind:
|
||||||
gsm8k_accuracy: 75
|
gsm8k_accuracy: 71.88
|
||||||
GPQA_diamond_accuracy: 18.75
|
GPQA_diamond_accuracy: 18.75
|
||||||
race-high_accuracy: 68.75
|
race-high_accuracy: 68.75
|
||||||
winogrande_accuracy: 81.25
|
winogrande_accuracy: 81.25
|
||||||
@ -406,18 +406,18 @@ base:
|
|||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 78.12
|
||||||
qwen2.5-32b-turbomind:
|
qwen2.5-32b-turbomind:
|
||||||
gsm8k_accuracy: 87.5
|
gsm8k_accuracy: 84.38
|
||||||
GPQA_diamond_accuracy: 18.75
|
GPQA_diamond_accuracy: 28.12
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 81.25
|
winogrande_accuracy: 81.25
|
||||||
deepseek-67b-base-turbomind:
|
deepseek-67b-base-turbomind:
|
||||||
gsm8k_accuracy: 53.12
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 34.38
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 78.12
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 81.25
|
||||||
llama-3-70b-turbomind:
|
llama-3-70b-turbomind:
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 56.25
|
||||||
GPQA_diamond_accuracy: 12.50
|
GPQA_diamond_accuracy: 15.62
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
qwen2.5-72b-turbomind:
|
qwen2.5-72b-turbomind:
|
||||||
@ -426,7 +426,7 @@ base:
|
|||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 87.5
|
winogrande_accuracy: 87.5
|
||||||
deepseek-v2-turbomind:
|
deepseek-v2-turbomind:
|
||||||
gsm8k_accuracy: 59.38
|
gsm8k_accuracy: 65.62
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 9.38
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 81.25
|
winogrande_accuracy: 81.25
|
||||||
|
2
.github/workflows/daily-run-test.yml
vendored
2
.github/workflows/daily-run-test.yml
vendored
@ -44,7 +44,7 @@ on:
|
|||||||
type: string
|
type: string
|
||||||
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
|
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
|
||||||
schedule:
|
schedule:
|
||||||
- cron: '15 14 * * 0,2'
|
- cron: '15 14 * * 0,3'
|
||||||
|
|
||||||
env:
|
env:
|
||||||
HF_DATASETS_OFFLINE: 1
|
HF_DATASETS_OFFLINE: 1
|
||||||
|
@ -7,6 +7,7 @@ from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
|
|||||||
from .arena_hard import ArenaHardDataset # noqa: F401, F403
|
from .arena_hard import ArenaHardDataset # noqa: F401, F403
|
||||||
from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403
|
from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403
|
||||||
from .arena_hard import arenahard_postprocess # noqa: F401, F403
|
from .arena_hard import arenahard_postprocess # noqa: F401, F403
|
||||||
|
from .commonbench import commonbench_postprocess
|
||||||
from .compass_arena import CompassArenaDataset # noqa: F401, F403
|
from .compass_arena import CompassArenaDataset # noqa: F401, F403
|
||||||
from .compass_arena import \
|
from .compass_arena import \
|
||||||
compassarena_bradleyterry_postprocess # noqa: F401, F403
|
compassarena_bradleyterry_postprocess # noqa: F401, F403
|
||||||
|
56
opencompass/datasets/subjective/commonbench.py
Normal file
56
opencompass/datasets/subjective/commonbench.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
# flake8: noqa: E501
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from opencompass.registry import DICT_POSTPROCESSORS
|
||||||
|
|
||||||
|
from .utils import get_judgeanswer_and_reference
|
||||||
|
|
||||||
|
|
||||||
|
def post_process(judgement: dict):
    """Extract the judge's numeric score from one judgement record.

    Args:
        judgement: A dict whose ``'prediction'`` entry is the judge
            model's raw text containing a bracketed score marker such
            as ``xxx[[5]]xxx`` (fractional scores like ``[[4.5]]`` are
            allowed).

    Returns:
        ``{'score': <float>}`` built from the first ``[[...]]`` marker,
        or ``None`` when no parseable score is present.
    """
    # NOTE: the original annotated `judgement` as str, but it is a dict —
    # the prediction text lives under the 'prediction' key.
    text = judgement['prediction']
    pattern = r'\[\[([\d.]+)\]\]'
    matched_result = re.findall(pattern, text)
    if not matched_result:
        return None
    try:
        # First marker wins when several are present.
        score = float(matched_result[0])
    except ValueError:
        # Degenerate matches like '[[.]]' are not valid numbers.
        return None
    return {'score': score}
||||||
|
|
||||||
|
|
||||||
|
def get_capability_results(judged_answers, references):
    """Average judge scores overall and per capability.

    Args:
        judged_answers: Iterable of dicts each carrying a numeric
            ``'score'``.
        references: Parallel iterable of dicts each carrying a
            ``'capability'`` label.

    Returns:
        A ``defaultdict(float)`` mapping ``'total'`` and every
        capability label to its mean score, rounded to two decimals.
    """
    score_sums = defaultdict(int)
    sample_counts = defaultdict(int)
    for answer, reference in zip(judged_answers, references):
        # Every sample contributes to the overall bucket and to its
        # own capability bucket.
        for bucket in ('total', reference['capability']):
            score_sums[bucket] += answer['score']
            sample_counts[bucket] += 1

    averages = defaultdict(float)
    for bucket, summed in score_sums.items():
        averages[bucket] = round(summed / sample_counts[bucket], 2)
    return averages
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): the registry key 'commenbench' looks like a typo for
# 'commonbench', but it is the externally visible registration name —
# renaming it would break existing config lookups, so it is kept as-is.
@DICT_POSTPROCESSORS.register_module('commenbench')
def commonbench_postprocess(
    output: dict,
    output_path: str,
    post_process: Optional[callable] = post_process,
) -> dict:
    """Turn raw judge output into per-capability average scores.

    Args:
        output: Raw judgement records produced by the judge model.
        output_path: Path used by the judgement/reference loader.
        post_process: Callable that extracts a score dict from a single
            judgement record; defaults to this module's ``post_process``.

    Returns:
        Dict of capability averages (including ``'total'``) with the
        raw ``output`` attached under ``'details'``.
    """
    answers, refs = get_judgeanswer_and_reference(output, output_path,
                                                  post_process)
    summary = get_capability_results(answers, refs)
    summary['details'] = output
    return summary
Loading…
Reference in New Issue
Block a user