[ci] update baseline for kernel change of vllm and lmdeploy (#2011)

* update

* update

* update

* update

* update

* update

* update
This commit is contained in:
zhulinJulia24 2025-04-09 14:09:35 +08:00 committed by GitHub
parent a05f9da134
commit 6ac9b06bc2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 110 additions and 53 deletions

View File

@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
drop_accuracy: 81.25 drop_accuracy: 81.25
GPQA_diamond_accuracy: 25 GPQA_diamond_accuracy: 25
hellaswag_accuracy: 87.5 hellaswag_accuracy: 87.5
TheoremQA_score: 18.75 TheoremQA_score: 12.50
musr_average_naive_average: 39.58 musr_average_naive_average: 39.58
korbench_single_naive_average: 40 korbench_single_naive_average: 40
gsm8k_accuracy: 62.50 gsm8k_accuracy: 62.50
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
drop_accuracy: 62.5 drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5 GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75 hellaswag_accuracy: 93.75
TheoremQA_score: 25 TheoremQA_score: 12.50
winogrande_accuracy: 75 winogrande_accuracy: 75
gsm8k_accuracy: 37.5 gsm8k_accuracy: 37.5
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@ -190,7 +190,7 @@ internlm2_5-7b-turbomind_fullbench:
drop_accuracy: 62.5 drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5 GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75 hellaswag_accuracy: 93.75
TheoremQA_score: 31.25 TheoremQA_score: 12.50
winogrande_accuracy: 87.5 winogrande_accuracy: 87.5
gsm8k_accuracy: 56.25 gsm8k_accuracy: 56.25
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
@ -391,7 +391,7 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_total: 25.96 alpaca_eval_total: 25.96
arenahard_score: 17.15 arenahard_score: 17.15
Followbench_naive_average: 0.81 Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61 CompassArena_naive_average: 39.49
FoFo_naive_average: 0.38 FoFo_naive_average: 0.38
mtbench101_avg: 8.01 mtbench101_avg: 8.01
wildbench_average: -10.49 wildbench_average: -10.49
@ -410,10 +410,10 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_oasst: 23.4 alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95 alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 33.75 alpaca_eval_vicuna: 33.75
compassarena_language_naive_average: 52.5 compassarena_language_naive_average: 58.50
compassarena_knowledge_naive_average: 36 compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35 compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91 compassarena_math_v2_naive_average: 25.95
compassarena_creationv2_zh_naive_average: 43.64 compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35 fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41 fofo_test_prompts_cn_overall: 0.41
@ -493,7 +493,7 @@ qwen2.5-7b-instruct-turbomind:
bigcodebench_hard_instruct_pass@1: 16.22 bigcodebench_hard_instruct_pass@1: 16.22
bigcodebench_hard_complete_pass@1: 11.49 bigcodebench_hard_complete_pass@1: 11.49
teval_naive_average: 79.72 teval_naive_average: 79.72
SciCode_sub_accuracy: 100 SciCode_sub_accuracy: 10.76
qa_dingo_cn_score: 99.01 qa_dingo_cn_score: 99.01
mmlu_accuracy: 76.01 mmlu_accuracy: 76.01
mmlu-stem_accuracy: 77.59 mmlu-stem_accuracy: 77.59
@ -600,7 +600,7 @@ internlm2_5-7b-chat-pytorch:
bigcodebench_hard_instruct_pass@1: 6.08 bigcodebench_hard_instruct_pass@1: 6.08
bigcodebench_hard_complete_pass@1: 6.76 bigcodebench_hard_complete_pass@1: 6.76
teval_naive_average: 79.73 teval_naive_average: 79.73
SciCode_sub_accuracy: 100 SciCode_sub_accuracy: 3.47
qa_dingo_cn_score: 100 qa_dingo_cn_score: 100
mmlu_accuracy: 70.2 mmlu_accuracy: 70.2
mmlu-stem_accuracy: 67.73 mmlu-stem_accuracy: 67.73
@ -689,7 +689,7 @@ qwen2.5-7b-instruct-pytorch:
GaokaoBench_weighted_average: 80.02 GaokaoBench_weighted_average: 80.02
math_accuracy: 73.74 math_accuracy: 73.74
cmo_fib_accuracy: 26.44 cmo_fib_accuracy: 26.44
aime2024_accuracy: 10 aime2024_accuracy: 13.33
Mathbench_naive_average: 77.08 Mathbench_naive_average: 77.08
wikibench-wiki-single_choice_cncircular_perf_4: 34 wikibench-wiki-single_choice_cncircular_perf_4: 34
cmmlu_naive_average: 75.9 cmmlu_naive_average: 75.9
@ -705,7 +705,7 @@ qwen2.5-7b-instruct-pytorch:
bigcodebench_hard_instruct_pass@1: 16.89 bigcodebench_hard_instruct_pass@1: 16.89
bigcodebench_hard_complete_pass@1: 12.16 bigcodebench_hard_complete_pass@1: 12.16
teval_naive_average: 79.46 teval_naive_average: 79.46
SciCode_sub_accuracy: 100 SciCode_sub_accuracy: 10.42
qa_dingo_cn_score: 100 qa_dingo_cn_score: 100
mmlu_accuracy: 76.27 mmlu_accuracy: 76.27
mmlu-stem_accuracy: 77.75 mmlu-stem_accuracy: 77.75
@ -810,7 +810,7 @@ internlm3-8b-instruct-turbomind:
bigcodebench_hard_instruct_pass@1: 13.51 bigcodebench_hard_instruct_pass@1: 13.51
bigcodebench_hard_complete_pass@1: 15.54 bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86 teval_naive_average: 82.86
SciCode_sub_accuracy: 100 SciCode_sub_accuracy: 11.11
qa_dingo_cn_score: 100 qa_dingo_cn_score: 100
mmlu_accuracy: 76.21 mmlu_accuracy: 76.21
mmlu-stem_accuracy: 77.7 mmlu-stem_accuracy: 77.7
@ -889,7 +889,7 @@ internlm3-8b-instruct-pytorch:
IFEval_Prompt-level-strict-accuracy: 79.11 IFEval_Prompt-level-strict-accuracy: 79.11
drop_accuracy: 83.32 drop_accuracy: 83.32
bbh_naive_average: 54.76 bbh_naive_average: 54.76
GPQA_diamond_accuracy: 42.42 GPQA_diamond_accuracy: 33.84
hellaswag_accuracy: 91.31 hellaswag_accuracy: 91.31
TheoremQA_score: 18 TheoremQA_score: 18
musr_average_naive_average: 36.62 musr_average_naive_average: 36.62
@ -915,7 +915,7 @@ internlm3-8b-instruct-pytorch:
bigcodebench_hard_instruct_pass@1: 12.84 bigcodebench_hard_instruct_pass@1: 12.84
bigcodebench_hard_complete_pass@1: 15.54 bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86 teval_naive_average: 82.86
SciCode_sub_accuracy: 100 SciCode_sub_accuracy: 9.38
qa_dingo_cn_score: 100 qa_dingo_cn_score: 100
mmlu_accuracy: 76.23 mmlu_accuracy: 76.23
mmlu-stem_accuracy: 78.08 mmlu-stem_accuracy: 78.08

View File

@ -6,7 +6,7 @@ chat:
gsm8k_accuracy: 71.88 gsm8k_accuracy: 71.88
race-high_accuracy: 90.62 race-high_accuracy: 90.62
glm-4-9b-chat-vllm: glm-4-9b-chat-vllm:
gsm8k_accuracy: 68.75 gsm8k_accuracy: 71.88
race-high_accuracy: 90.62 race-high_accuracy: 90.62
deepseek-7b-chat-hf: deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88 gsm8k_accuracy: 46.88
@ -84,7 +84,7 @@ chat:
gsm8k_accuracy: 81.25 gsm8k_accuracy: 81.25
race-high_accuracy: 90.62 race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind: llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 75.00 gsm8k_accuracy: 68.75
race-high_accuracy: 81.25 race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind: llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 68.75 gsm8k_accuracy: 68.75
@ -204,14 +204,14 @@ chat:
gsm8k_accuracy: 90.62 gsm8k_accuracy: 90.62
race-high_accuracy: 84.38 race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-turbomind: mixtral-8x22b-instruct-v0.1-turbomind:
gsm8k_accuracy: 75 gsm8k_accuracy: 78.12
race-high_accuracy: 78.12 race-high_accuracy: 78.12
mixtral-8x22b-instruct-v0.1-vllm: mixtral-8x22b-instruct-v0.1-vllm:
gsm8k_accuracy: 78.12 gsm8k_accuracy: 78.12
race-high_accuracy: 78.12 race-high_accuracy: 78.12
base: base:
glm-4-9b-turbomind: glm-4-9b-turbomind:
gsm8k_accuracy: 56.25 gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 28.12 GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 84.38 winogrande_accuracy: 84.38
@ -253,8 +253,8 @@ base:
gemma-2-9b-turbomind: gemma-2-9b-turbomind:
gsm8k_accuracy: 68.75 gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 0 GPQA_diamond_accuracy: 0
race-high_accuracy: 78.12 race-high_accuracy: 18.75
winogrande_accuracy: 50 winogrande_accuracy: 46.88
gemma-2b-vllm: gemma-2b-vllm:
gsm8k_accuracy: 15.62 gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12 GPQA_diamond_accuracy: 3.12
@ -281,20 +281,20 @@ base:
race-high_accuracy: 71.88 race-high_accuracy: 71.88
winogrande_accuracy: 75 winogrande_accuracy: 75
internlm2_5-7b-turbomind: internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
internlm2-7b-turbomind:
gsm8k_accuracy: 59.38 gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 34.38 GPQA_diamond_accuracy: 34.38
race-high_accuracy: 93.75 race-high_accuracy: 78.12
winogrande_accuracy: 84.38 winogrande_accuracy: 71.88
internlm2-7b-turbomind:
gsm8k_accuracy: 50
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 71.88
winogrande_accuracy: 84.38
internlm2-base-7b-turbomind: internlm2-base-7b-turbomind:
gsm8k_accuracy: 37.50 gsm8k_accuracy: 28.12
GPQA_diamond_accuracy: 21.88 GPQA_diamond_accuracy: 31.25
race-high_accuracy: 84.38 race-high_accuracy: 71.88
winogrande_accuracy: 75 winogrande_accuracy: 62.50
llama-2-7b-hf: llama-2-7b-hf:
gsm8k_accuracy: 21.88 gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 21.88 GPQA_diamond_accuracy: 21.88
@ -311,15 +311,15 @@ base:
race-high_accuracy: 65.62 race-high_accuracy: 65.62
winogrande_accuracy: 65.62 winogrande_accuracy: 65.62
llama-3.1-8b-turbomind: llama-3.1-8b-turbomind:
gsm8k_accuracy: 56.25 gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 9.38 GPQA_diamond_accuracy: 15.62
race-high_accuracy: 78.12 race-high_accuracy: 78.12
winogrande_accuracy: 78.12 winogrande_accuracy: 78.12
llama-3-8b-turbomind: llama-3-8b-turbomind:
gsm8k_accuracy: 46.88 gsm8k_accuracy: 46.88
GPQA_diamond_accuracy: 12.50 GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62 race-high_accuracy: 65.62
winogrande_accuracy: 78.12 winogrande_accuracy: 81.25
mistral-7b-v0.3-hf: mistral-7b-v0.3-hf:
gsm8k_accuracy: 31.25 gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 6.25 GPQA_diamond_accuracy: 6.25
@ -331,8 +331,8 @@ base:
race-high_accuracy: 87.5 race-high_accuracy: 87.5
winogrande_accuracy: 71.88 winogrande_accuracy: 71.88
qwen2.5-1.5b-turbomind: qwen2.5-1.5b-turbomind:
gsm8k_accuracy: 62.50 gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 15.62 GPQA_diamond_accuracy: 18.75
race-high_accuracy: 75 race-high_accuracy: 75
winogrande_accuracy: 71.88 winogrande_accuracy: 71.88
qwen2.5-7b-turbomind: qwen2.5-7b-turbomind:
@ -362,19 +362,19 @@ base:
winogrande_accuracy: 68.75 winogrande_accuracy: 68.75
qwen2-1.5b-turbomind: qwen2-1.5b-turbomind:
gsm8k_accuracy: 59.38 gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 12.50 GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25 race-high_accuracy: 81.25
winogrande_accuracy: 75 winogrande_accuracy: 75
qwen2-7b-turbomind: qwen2-7b-turbomind:
gsm8k_accuracy: 65.62 gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 12.5 GPQA_diamond_accuracy: 12.5
race-high_accuracy: 87.5 race-high_accuracy: 87.5
winogrande_accuracy: 71.88 winogrande_accuracy: 75
qwen1.5-0.5b-vllm: qwen1.5-0.5b-vllm:
gsm8k_accuracy: 6.25 gsm8k_accuracy: 9.38
GPQA_diamond_accuracy: 0 GPQA_diamond_accuracy: 0
race-high_accuracy: 56.25 race-high_accuracy: 56.25
winogrande_accuracy: 62.5 winogrande_accuracy: 59.38
yi-1.5-6b-hf: yi-1.5-6b-hf:
gsm8k_accuracy: 62.5 gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12 GPQA_diamond_accuracy: 3.12
@ -387,11 +387,11 @@ base:
winogrande_accuracy: 59.38 winogrande_accuracy: 59.38
yi-1.5-9b-turbomind: yi-1.5-9b-turbomind:
gsm8k_accuracy: 78.12 gsm8k_accuracy: 78.12
GPQA_diamond_accuracy: 43.75 GPQA_diamond_accuracy: 40.62
race-high_accuracy: 87.5 race-high_accuracy: 87.5
winogrande_accuracy: 71.88 winogrande_accuracy: 65.62
internlm2-20b-turbomind: internlm2-20b-turbomind:
gsm8k_accuracy: 75 gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 18.75 GPQA_diamond_accuracy: 18.75
race-high_accuracy: 68.75 race-high_accuracy: 68.75
winogrande_accuracy: 81.25 winogrande_accuracy: 81.25
@ -406,18 +406,18 @@ base:
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 78.12 winogrande_accuracy: 78.12
qwen2.5-32b-turbomind: qwen2.5-32b-turbomind:
gsm8k_accuracy: 87.5 gsm8k_accuracy: 84.38
GPQA_diamond_accuracy: 18.75 GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 81.25 winogrande_accuracy: 81.25
deepseek-67b-base-turbomind: deepseek-67b-base-turbomind:
gsm8k_accuracy: 53.12 gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 28.12 GPQA_diamond_accuracy: 34.38
race-high_accuracy: 81.25 race-high_accuracy: 78.12
winogrande_accuracy: 84.38 winogrande_accuracy: 81.25
llama-3-70b-turbomind: llama-3-70b-turbomind:
gsm8k_accuracy: 56.25 gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 12.50 GPQA_diamond_accuracy: 15.62
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 84.38 winogrande_accuracy: 84.38
qwen2.5-72b-turbomind: qwen2.5-72b-turbomind:
@ -426,7 +426,7 @@ base:
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 87.5 winogrande_accuracy: 87.5
deepseek-v2-turbomind: deepseek-v2-turbomind:
gsm8k_accuracy: 59.38 gsm8k_accuracy: 65.62
GPQA_diamond_accuracy: 3.12 GPQA_diamond_accuracy: 9.38
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 81.25 winogrande_accuracy: 81.25

View File

@ -44,7 +44,7 @@ on:
type: string type: string
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']" default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
schedule: schedule:
- cron: '15 14 * * 0,2' - cron: '15 14 * * 0,3'
env: env:
HF_DATASETS_OFFLINE: 1 HF_DATASETS_OFFLINE: 1

View File

@ -7,6 +7,7 @@ from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403 from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403 from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403
from .arena_hard import arenahard_postprocess # noqa: F401, F403 from .arena_hard import arenahard_postprocess # noqa: F401, F403
from .commonbench import commonbench_postprocess
from .compass_arena import CompassArenaDataset # noqa: F401, F403 from .compass_arena import CompassArenaDataset # noqa: F401, F403
from .compass_arena import \ from .compass_arena import \
compassarena_bradleyterry_postprocess # noqa: F401, F403 compassarena_bradleyterry_postprocess # noqa: F401, F403

View File

@ -0,0 +1,56 @@
# flake8: noqa: E501
import re
from collections import defaultdict
from typing import Optional
from opencompass.registry import DICT_POSTPROCESSORS
from .utils import get_judgeanswer_and_reference
def post_process(judgement: dict) -> Optional[dict]:
    """Extract the judge score from a single judgement record.

    Args:
        judgement: A dict whose ``'prediction'`` entry holds the judge's
            raw output text, expected to contain a score wrapped in double
            brackets, e.g. ``'xxx[[5]]xxx'``.

    Returns:
        ``{'score': <float>}`` built from the first ``[[number]]`` marker,
        or ``None`` when no marker is present.
    """
    prediction = judgement['prediction']
    # Only the first [[...]] marker counts; any later ones are ignored.
    matched_result = re.findall(r'\[\[([\d.]+)\]\]', prediction)
    if not matched_result:
        return None
    return {'score': float(matched_result[0])}
def get_capability_results(judged_answers, references):
    """Aggregate judge scores into per-capability averages.

    Args:
        judged_answers: Iterable of dicts, each with a numeric ``'score'``.
        references: Iterable of dicts (paired positionally with
            *judged_answers*) each naming its bucket under ``'capability'``.

    Returns:
        A ``defaultdict(float)`` mapping every capability — plus a synthetic
        ``'total'`` bucket covering all answers — to its mean score,
        rounded to two decimal places.
    """
    score_sums = defaultdict(int)
    counts = defaultdict(int)
    for answer, reference in zip(judged_answers, references):
        # Every answer contributes to the global 'total' bucket as well as
        # to its own capability bucket.
        for bucket in ('total', reference['capability']):
            score_sums[bucket] += answer['score']
            counts[bucket] += 1
    averages = defaultdict(float)
    for bucket, score_sum in score_sums.items():
        averages[bucket] = round(score_sum / counts[bucket], 2)
    return averages
# NOTE(review): registered under 'commenbench' while the function is named
# 'commonbench_postprocess' — looks like a typo, but downstream configs may
# already reference this key, so it is kept verbatim; confirm before renaming.
@DICT_POSTPROCESSORS.register_module('commenbench')
def commonbench_postprocess(
    output: dict,
    output_path: str,
    post_process: Optional[callable] = post_process,
) -> dict:
    """Turn raw judge output into per-capability average scores.

    Args:
        output: Raw judged output dict for one evaluation run.
        output_path: Path handed to ``get_judgeanswer_and_reference`` when
            extracting judge answers.
        post_process: Callable that pulls a score dict from one judgement;
            defaults to this module's ``post_process``.

    Returns:
        Dict of capability averages (including the ``'total'`` bucket) with
        an extra ``'details'`` entry carrying the original *output*.
    """
    answers, refs = get_judgeanswer_and_reference(output, output_path,
                                                  post_process)
    capability_scores = get_capability_results(answers, refs)
    capability_scores['details'] = output
    return capability_scores