From 6ac9b06bc2209d82333987671d20175d7cd0d2f7 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:09:35 +0800 Subject: [PATCH] [ci] update baseline for kernal change of vllm and lmdeploy (#2011) * update * update * update * update * update * update * update --- .../scripts/oc_score_baseline_fullbench.yaml | 26 +++---- .../scripts/oc_score_baseline_testrange.yaml | 78 +++++++++---------- .github/workflows/daily-run-test.yml | 2 +- opencompass/datasets/subjective/__init__.py | 1 + .../datasets/subjective/commonbench.py | 56 +++++++++++++ 5 files changed, 110 insertions(+), 53 deletions(-) create mode 100644 opencompass/datasets/subjective/commonbench.py diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 3f5753d3..fd355c0e 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench: drop_accuracy: 81.25 GPQA_diamond_accuracy: 25 hellaswag_accuracy: 87.5 - TheoremQA_score: 18.75 + TheoremQA_score: 12.50 musr_average_naive_average: 39.58 korbench_single_naive_average: 40 gsm8k_accuracy: 62.50 @@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 25 + TheoremQA_score: 12.50 winogrande_accuracy: 75 gsm8k_accuracy: 37.5 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 @@ -190,7 +190,7 @@ internlm2_5-7b-turbomind_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 31.25 + TheoremQA_score: 12.50 winogrande_accuracy: 87.5 gsm8k_accuracy: 56.25 GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 @@ -391,7 +391,7 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_total: 25.96 arenahard_score: 17.15 Followbench_naive_average: 0.81 - CompassArena_naive_average: 34.61 + CompassArena_naive_average: 39.49 FoFo_naive_average: 0.38 mtbench101_avg: 8.01 wildbench_average: -10.49 @@ -410,10 +410,10 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 alpaca_eval_vicuna: 33.75 - compassarena_language_naive_average: 52.5 + compassarena_language_naive_average: 58.50 compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 - compassarena_math_v2_naive_average: 19.91 + compassarena_math_v2_naive_average: 25.95 compassarena_creationv2_zh_naive_average: 43.64 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 @@ -493,7 +493,7 @@ qwen2.5-7b-instruct-turbomind: bigcodebench_hard_instruct_pass@1: 16.22 bigcodebench_hard_complete_pass@1: 11.49 teval_naive_average: 79.72 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 10.76 qa_dingo_cn_score: 99.01 mmlu_accuracy: 76.01 mmlu-stem_accuracy: 77.59 @@ -600,7 +600,7 @@ internlm2_5-7b-chat-pytorch: bigcodebench_hard_instruct_pass@1: 6.08 bigcodebench_hard_complete_pass@1: 6.76 teval_naive_average: 79.73 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 3.47 qa_dingo_cn_score: 100 mmlu_accuracy: 70.2 mmlu-stem_accuracy: 67.73 @@ -689,7 +689,7 @@ qwen2.5-7b-instruct-pytorch: GaokaoBench_weighted_average: 80.02 math_accuracy: 73.74 cmo_fib_accuracy: 26.44 - aime2024_accuracy: 10 + aime2024_accuracy: 13.33 Mathbench_naive_average: 77.08 wikibench-wiki-single_choice_cncircular_perf_4: 34 cmmlu_naive_average: 75.9 @@ -705,7 +705,7 @@ qwen2.5-7b-instruct-pytorch: bigcodebench_hard_instruct_pass@1: 16.89 bigcodebench_hard_complete_pass@1: 12.16 teval_naive_average: 79.46 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 10.42 qa_dingo_cn_score: 100 mmlu_accuracy: 76.27 mmlu-stem_accuracy: 77.75 @@ -810,7 +810,7 @@ internlm3-8b-instruct-turbomind: bigcodebench_hard_instruct_pass@1: 13.51 bigcodebench_hard_complete_pass@1: 15.54 teval_naive_average: 82.86 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 11.11 qa_dingo_cn_score: 100 mmlu_accuracy: 76.21 mmlu-stem_accuracy: 77.7 @@ -889,7 +889,7 @@ internlm3-8b-instruct-pytorch: IFEval_Prompt-level-strict-accuracy: 79.11 drop_accuracy: 83.32 bbh_naive_average: 54.76 - GPQA_diamond_accuracy: 42.42 + GPQA_diamond_accuracy: 33.84 hellaswag_accuracy: 91.31 TheoremQA_score: 18 musr_average_naive_average: 36.62 @@ -915,7 +915,7 @@ internlm3-8b-instruct-pytorch: bigcodebench_hard_instruct_pass@1: 12.84 bigcodebench_hard_complete_pass@1: 15.54 teval_naive_average: 82.86 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 9.38 qa_dingo_cn_score: 100 mmlu_accuracy: 76.23 mmlu-stem_accuracy: 78.08 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 16a13209..94a28d36 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -6,7 +6,7 @@ chat: gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 glm-4-9b-chat-vllm: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 deepseek-7b-chat-hf: gsm8k_accuracy: 46.88 @@ -84,7 +84,7 @@ chat: gsm8k_accuracy: 81.25 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 75.00 + gsm8k_accuracy: 68.75 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: gsm8k_accuracy: 68.75 @@ -204,14 +204,14 @@ chat: gsm8k_accuracy: 90.62 race-high_accuracy: 84.38 mixtral-8x22b-instruct-v0.1-turbomind: - gsm8k_accuracy: 75 + gsm8k_accuracy: 78.12 race-high_accuracy: 78.12 mixtral-8x22b-instruct-v0.1-vllm: gsm8k_accuracy: 78.12 race-high_accuracy: 78.12 base: glm-4-9b-turbomind: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 @@ -253,8 +253,8 @@ base: gemma-2-9b-turbomind: gsm8k_accuracy: 68.75 GPQA_diamond_accuracy: 0 - race-high_accuracy: 78.12 - winogrande_accuracy: 50 + race-high_accuracy: 18.75 + winogrande_accuracy: 46.88 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 @@ -281,20 +281,20 @@ base: race-high_accuracy: 71.88 winogrande_accuracy: 75 internlm2_5-7b-turbomind: + gsm8k_accuracy: 62.5 + GPQA_diamond_accuracy: 31.25 + race-high_accuracy: 93.75 + winogrande_accuracy: 87.5 + internlm2-7b-turbomind: gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 34.38 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 - internlm2-7b-turbomind: - gsm8k_accuracy: 50 - GPQA_diamond_accuracy: 18.75 - race-high_accuracy: 71.88 - winogrande_accuracy: 84.38 + race-high_accuracy: 78.12 + winogrande_accuracy: 71.88 internlm2-base-7b-turbomind: - gsm8k_accuracy: 37.50 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 84.38 - winogrande_accuracy: 75 + gsm8k_accuracy: 28.12 + GPQA_diamond_accuracy: 31.25 + race-high_accuracy: 71.88 + winogrande_accuracy: 62.50 llama-2-7b-hf: gsm8k_accuracy: 21.88 GPQA_diamond_accuracy: 21.88 @@ -311,15 +311,15 @@ base: race-high_accuracy: 65.62 winogrande_accuracy: 65.62 llama-3.1-8b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 15.62 race-high_accuracy: 78.12 winogrande_accuracy: 78.12 llama-3-8b-turbomind: gsm8k_accuracy: 46.88 GPQA_diamond_accuracy: 12.50 race-high_accuracy: 65.62 - winogrande_accuracy: 78.12 + winogrande_accuracy: 81.25 mistral-7b-v0.3-hf: gsm8k_accuracy: 31.25 GPQA_diamond_accuracy: 6.25 @@ -331,8 +331,8 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 71.88 qwen2.5-1.5b-turbomind: - gsm8k_accuracy: 62.50 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 75 winogrande_accuracy: 71.88 qwen2.5-7b-turbomind: @@ -362,19 +362,19 @@ base: winogrande_accuracy: 68.75 qwen2-1.5b-turbomind: gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 12.50 + GPQA_diamond_accuracy: 6.25 race-high_accuracy: 81.25 winogrande_accuracy: 75 qwen2-7b-turbomind: - gsm8k_accuracy: 65.62 + gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 12.5 race-high_accuracy: 87.5 - winogrande_accuracy: 71.88 + winogrande_accuracy: 75 qwen1.5-0.5b-vllm: - gsm8k_accuracy: 6.25 + gsm8k_accuracy: 9.38 GPQA_diamond_accuracy: 0 race-high_accuracy: 56.25 - winogrande_accuracy: 62.5 + winogrande_accuracy: 59.38 yi-1.5-6b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 3.12 @@ -387,11 +387,11 @@ base: winogrande_accuracy: 59.38 yi-1.5-9b-turbomind: gsm8k_accuracy: 78.12 - GPQA_diamond_accuracy: 43.75 + GPQA_diamond_accuracy: 40.62 race-high_accuracy: 87.5 - winogrande_accuracy: 71.88 + winogrande_accuracy: 65.62 internlm2-20b-turbomind: - gsm8k_accuracy: 75 + gsm8k_accuracy: 71.88 GPQA_diamond_accuracy: 18.75 race-high_accuracy: 68.75 winogrande_accuracy: 81.25 @@ -406,18 +406,18 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 78.12 qwen2.5-32b-turbomind: - gsm8k_accuracy: 87.5 - GPQA_diamond_accuracy: 18.75 + gsm8k_accuracy: 84.38 + GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 deepseek-67b-base-turbomind: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 81.25 - winogrande_accuracy: 84.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 34.38 + race-high_accuracy: 78.12 + winogrande_accuracy: 81.25 llama-3-70b-turbomind: gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 12.50 + GPQA_diamond_accuracy: 15.62 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 qwen2.5-72b-turbomind: @@ -426,7 +426,7 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 3.12 + gsm8k_accuracy: 65.62 + GPQA_diamond_accuracy: 9.38 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 6a1c2ebc..e6000c09 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -44,7 +44,7 @@ on: type: string default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']" schedule: - - cron: '15 14 * * 0,2' + - cron: '15 14 * * 0,3' env: HF_DATASETS_OFFLINE: 1 diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py index 09a0a8ed..8d663528 100644 --- a/opencompass/datasets/subjective/__init__.py +++ b/opencompass/datasets/subjective/__init__.py @@ -7,6 +7,7 @@ from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403 from .arena_hard import ArenaHardDataset # noqa: F401, F403 from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403 from .arena_hard import arenahard_postprocess # noqa: F401, F403 +from .commonbench import commonbench_postprocess from .compass_arena import CompassArenaDataset # noqa: F401, F403 from .compass_arena import \ compassarena_bradleyterry_postprocess # noqa: F401, F403 diff --git a/opencompass/datasets/subjective/commonbench.py b/opencompass/datasets/subjective/commonbench.py new file mode 100644 index 00000000..1b634111 --- /dev/null +++ b/opencompass/datasets/subjective/commonbench.py @@ -0,0 +1,56 @@ +# flake8: noqa: E501 +import re +from collections import defaultdict +from typing import Optional + +from opencompass.registry import DICT_POSTPROCESSORS + +from .utils import get_judgeanswer_and_reference + + +def post_process(judgement: str): + """Input a string like below: + + xxx[[5]]xxx, and extract the score + """ + judgement = judgement['prediction'] + pattern = r'\[\[([\d.]+)\]\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + score = float(matched_result[0]) + else: + return None + return {'score': score} + + +def get_capability_results(judged_answers, references): + capability_ratings = defaultdict(int) + capability_counts = defaultdict(int) + for ans, ref in zip(judged_answers, references): + capability_ratings['total'] += ans['score'] + capability_counts['total'] += 1 + capability_ratings[ref['capability']] += ans['score'] + capability_counts[ref['capability']] += 1 + + capability_avg_ratings = defaultdict(float) + + for capability, total_score in capability_ratings.items(): + s = total_score / capability_counts[capability] + s = round(s, 2) + capability_avg_ratings[capability] = s + + return capability_avg_ratings + + +@DICT_POSTPROCESSORS.register_module('commenbench') +def commonbench_postprocess( + output: dict, + output_path: str, + post_process: Optional[callable] = post_process, +) -> dict: + judged_answers, references = get_judgeanswer_and_reference( + output, output_path, post_process) + + results = get_capability_results(judged_answers, references) + results['details'] = output + return results