From ed81f9df302b6c9d61d5167e7c98bbedd238a09d Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Thu, 21 Nov 2024 10:37:33 +0800
Subject: [PATCH] [CI] update torch version and add more datasets to the
 daily test cases (#1701)

* update

---------

Co-authored-by: zhulin1
---
 .../scripts/eval_regression_base_fullbench.py |   4 +
 .github/scripts/eval_regression_chat.py       |   2 +
 ...val_regression_chat_objective_fullbench.py | 132 ++++++++++++------
 .github/scripts/oc_score_assert.py            |  17 ++-
 .../scripts/oc_score_baseline_fullbench.yaml  |  50 +++++--
 .../scripts/oc_score_baseline_testrange.yaml  |  20 +--
 .github/workflows/daily-run-test.yml          |  12 +-
 7 files changed, 160 insertions(+), 77 deletions(-)

diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py
index d5ad48c4..11c2f514 100644
--- a/.github/scripts/eval_regression_base_fullbench.py
+++ b/.github/scripts/eval_regression_base_fullbench.py
@@ -7,6 +7,8 @@ with read_base():
         bbh_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
         cmmlu_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.dingo.dingo_gen import \
+        datasets as dingo_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.drop.drop_gen_a2697c import \
         drop_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
@@ -120,6 +122,8 @@ summarizer = dict(
         ['winogrande', 'accuracy'],
         ['hellaswag', 'accuracy'],
         ['TheoremQA', 'score'],
+        ['dingo_en_192', 'score'],
+        ['dingo_zh_170', 'score'],
         '###### MathBench-A: Application Part ######',
         'college',
         'high',
diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py
index 68c225c5..7762e4f7 100644
--- a/.github/scripts/eval_regression_chat.py
+++ b/.github/scripts/eval_regression_chat.py
@@ -59,6 +59,8 @@ with read_base():
         models as hf_llama3_2_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
         models as hf_llama3_8b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
+        models as lmdeploy_llama2_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
         models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_objective_fullbench.py
index ff8dfba4..c66fba33 100644
--- a/.github/scripts/eval_regression_chat_objective_fullbench.py
+++ b/.github/scripts/eval_regression_chat_objective_fullbench.py
@@ -3,12 +3,16 @@ from mmengine.config import read_base
 with read_base():
     # read hf models - chat models
     # Dataset
+    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
+        aime2024_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
         ARC_c_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
         bbh_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
         cmmlu_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
+        cmo_fib_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
         drop_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
@@ -28,6 +32,8 @@ with read_base():
         humanevalx_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
         ifeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
+        LCB_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
         math_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
@@ -38,6 +44,10 @@ with read_base():
         mmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
         mmlu_pro_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
+        mmmlu_lite_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
+        musr_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
         nq_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
@@ -77,10 +87,14 @@ with read_base():
         mmlu_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mmlu_pro import \
         mmlu_pro_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.musr_average import \
+        summarizer as musr_summarizer  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.scicode import \
         scicode_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.teval import \
         teval_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.mmmlu_lite import \
+        mmmlu_summary_groups  # noqa: F401, E501
 
 # For HumanEval-X Evaluation
 # Apply the evaluator ip_address and port
@@ -122,6 +136,10 @@ mmlu_datasets = [
 ]
 
 mmlu_pro_datasets = [mmlu_pro_datasets[0]]
+
+mmmlu_lite_datasets = [
+    x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
+]
 mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
 GaokaoBench_datasets = [
     x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
@@ -137,52 +155,68 @@ datasets += teval_en_datasets
 datasets += teval_zh_datasets
 # datasets += SciCode_datasets
 
+musr_summary_groups = musr_summarizer['summary_groups']
+summary_groups = sum(
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
+summary_groups.append(
+    {
+        'name': 'Mathbench',
+        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
+    }, )
+
+# Summarizer
 summarizer = dict(
     dataset_abbrs=[
+        'Language',
         ['race-high', 'accuracy'],
         ['ARC-c', 'accuracy'],
         ['BoolQ', 'accuracy'],
-        ['mmlu_pro', 'naive_average'],
+        ['triviaqa_wiki_1shot', 'score'],
+        ['nq_open_1shot', 'score'],
+        ['mmmlu_lite', 'naive_average'],
+        '',
+        'Instruction Following',
+        ['IFEval', 'Prompt-level-strict-accuracy'],
+        '',
+        'General Reasoning',
         ['drop', 'accuracy'],
         ['bbh', 'naive_average'],
         ['GPQA_diamond', 'accuracy'],
+        ['hellaswag', 'accuracy'],
+        ['TheoremQA', 'score'],
+        ['musr_average', 'naive_average'],
+        '',
+        'Math Calculation',
+        ['gsm8k', 'accuracy'],
+        ['GaokaoBench', 'weighted_average'],
         ['math', 'accuracy'],
+        ['cmo_fib', 'accuracy'],
+        ['aime2024', 'accuracy'],
+        ['Mathbench', 'naive_average'],
+        '',
+        'Knowledge',
         ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
-        ['openai_humaneval', 'humaneval_pass@1'],
-        ['sanitized_mbpp', 'score'],
         ['cmmlu', 'naive_average'],
         ['mmlu', 'naive_average'],
+        ['mmlu_pro', 'naive_average'],
+        '',
+        'Code',
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['sanitized_mbpp', 'score'],
+        ['humanevalx', 'naive_average'],
+        ['ds1000', 'naive_average'],
+        ['lcb_code_generation', 'pass@1'],
+        ['lcb_code_execution', 'pass@1'],
+        ['lcb_test_output', 'pass@1'],
+        '',
+        'Agent',
         ['teval', 'naive_average'],
         ['SciCode', 'accuracy'],
         ['SciCode', 'sub_accuracy'],
-        ['humanevalx', 'naive_average'],
-        ['ds1000', 'naive_average'],
-        ['IFEval', 'Prompt-level-strict-accuracy'],
-        ['gsm8k', 'accuracy'],
-        ['GaokaoBench', 'weighted_average'],
-        ['triviaqa_wiki_1shot', 'score'],
-        ['nq_open_1shot', 'score'],
-        ['hellaswag', 'accuracy'],
-        ['TheoremQA', 'score'],
-        '###### MathBench-A: Application Part ######',
-        'college',
-        'high',
-        'middle',
-        'primary',
-        'arithmetic',
-        'mathbench-a (average)',
-        '###### MathBench-T: Theory Part ######',
-        'college_knowledge',
-        'high_knowledge',
-        'middle_knowledge',
-        'primary_knowledge',
-        'mathbench-t (average)',
-        '###### Overall: Average between MathBench-A and MathBench-T ######',
-        'Overall',
         '',
         'bbh-logical_deduction_seven_objects',
         'bbh-multistep_arithmetic_two',
-        ''
+        '',
         'mmlu',
         'mmlu-stem',
         'mmlu-social-science',
@@ -212,15 +246,6 @@ summarizer = dict(
         'mmlu_pro_psychology',
         'mmlu_pro_other',
         '',
-        'GaokaoBench_2010-2022_Math_II_MCQs',
-        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
-        '',
-        'humanevalx-python',
-        'humanevalx-cpp',
-        'humanevalx-go',
-        'humanevalx-java',
-        'humanevalx-js',
-        '',
         'ds1000_Pandas',
         'ds1000_Numpy',
         'ds1000_Tensorflow',
@@ -228,9 +253,38 @@ summarizer = dict(
         'ds1000_Sklearn',
         'ds1000_Pytorch',
         'ds1000_Matplotlib',
+        '',
+        'mmmlu_lite',
+        'openai_mmmlu_lite_AR-XY',
+        'openai_mmmlu_lite_BN-BD',
+        'openai_mmmlu_lite_DE-DE',
+        'openai_mmmlu_lite_ES-LA',
+        'openai_mmmlu_lite_FR-FR',
+        'openai_mmmlu_lite_HI-IN',
+        'openai_mmmlu_lite_ID-ID',
+        'openai_mmmlu_lite_IT-IT',
+        'openai_mmmlu_lite_JA-JP',
+        'openai_mmmlu_lite_KO-KR',
+        'openai_mmmlu_lite_PT-BR',
+        'openai_mmmlu_lite_SW-KE',
+        'openai_mmmlu_lite_YO-NG',
+        'openai_mmmlu_lite_ZH-CN',
+        '',
+        '###### MathBench-A: Application Part ######',
+        'college',
+        'high',
+        'middle',
+        'primary',
+        'arithmetic',
+        'mathbench-a (average)',
+        '###### MathBench-T: Theory Part ######',
+        'college_knowledge',
+        'high_knowledge',
+        'middle_knowledge',
+        'primary_knowledge',
+        'mathbench-t (average)',
     ],
-    summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+    summary_groups=summary_groups,
 )
 
 for d in datasets:
diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
index d8e33adb..179dec27 100644
--- a/.github/scripts/oc_score_assert.py
+++ b/.github/scripts/oc_score_assert.py
@@ -131,14 +131,16 @@ class TestChatObjFullbench:
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
     ] for p2 in [
-        'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
+        'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
+        'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
+        'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
         'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
-        'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag',
-        'TheoremQA', 'college', 'college_knowledge',
+        'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
         'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
         'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
         'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
-        'ds1000_Pytorch', 'ds1000_Matplotlib'
+        'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
+        'college', 'college_knowledge'
     ]])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
@@ -188,9 +190,10 @@ class TestBaseFullbench:
         'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
         'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
         'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
-        'TheoremQA', 'college', 'college_knowledge',
-        'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
-        'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math'
+        'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
+        'college_knowledge', 'bbh-logical_deduction_seven_objects',
+        'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
+        'mmlu_pro_math'
     ]])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml
index c95e7b91..413a99a3 100644
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -2,19 +2,24 @@ internlm2_5-7b-chat-hf_fullbench:
   race-high: 93.75
   ARC-c: 93.75
   BoolQ: 81.25
+  triviaqa_wiki_1shot: 50
+  nq_open_1shot: 25
+  IFEval: 50
   drop: 81.25
   GPQA_diamond: 25
+  hellaswag: 87.5
+  TheoremQA: 18.75
+  musr_average: 39.58
+  gsm8k: 56.25
   math: 75
+  cmo_fib: 6.25
+  aime2024: 6.25
   wikibench-wiki-single_choice_cncircular: 50
   sanitized_mbpp: 68.75
   ds1000: 16.96
-  gsm8k: 56.25
-  triviaqa_wiki_1shot: 50
-  nq_open_1shot: 25
-  hellaswag: 87.5
-  TheoremQA: 18.75
-  college: 12.5
-  college_knowledge: 87.5
+  lcb_code_generation: 12.5
+  lcb_code_execution: 43.75
+  lcb_test_output: 18.75
   bbh-logical_deduction_seven_objects: 50
   bbh-multistep_arithmetic_two: 68.75
   mmlu-other: 72.6
@@ -27,6 +32,9 @@ internlm2_5-7b-chat-hf_fullbench:
   ds1000_Sklearn: 18.75
   ds1000_Pytorch: 12.5
   ds1000_Matplotlib: 43.75
+  openai_mmmlu_lite_AR-XY: 37.5
+  college: 12.5
+  college_knowledge: 87.5
   Alignbench总分: 0.65
   Alignbench专业能力: 7.83
   AlpacaEvaltotal: 0
@@ -56,19 +64,24 @@ internlm2_5-7b-chat-turbomind_fullbench:
   race-high: 93.75
   ARC-c: 87.5
   BoolQ: 68.75
+  triviaqa_wiki_1shot: 50
+  nq_open_1shot: 25
+  IFEval: 50
   drop: 75
-  GPQA_diamond: 25
+  hellaswag: 81.25
+  TheoremQA: 6.25
+  musr_average: 39.58
+  gsm8k: 68.75
   math: 75
+  GPQA_diamond: 25
+  cmo_fib: 6.25
+  aime2024: 6.25
   wikibench-wiki-single_choice_cncircular: 25
   sanitized_mbpp: 68.75
   ds1000: 13.39
-  gsm8k: 68.75
-  triviaqa_wiki_1shot: 50
-  nq_open_1shot: 25
-  hellaswag: 81.25
-  TheoremQA: 6.25
-  college: 0
-  college_knowledge: 87.5
+  lcb_code_generation: 12.5
+  lcb_code_execution: 43.75
+  lcb_test_output: 12.5
   bbh-logical_deduction_seven_objects: 56.25
   bbh-multistep_arithmetic_two: 68.75
   mmlu-other: 74.04
@@ -81,6 +94,9 @@ internlm2_5-7b-chat-turbomind_fullbench:
   ds1000_Sklearn: 18.75
   ds1000_Pytorch: 6.25
   ds1000_Matplotlib: 37.5
+  openai_mmmlu_lite_AR-XY: 37.5
+  college: 0
+  college_knowledge: 87.5
   Alignbench总分: 0.64
   Alignbench专业能力: 7.6
   AlpacaEvaltotal: 10
@@ -121,6 +137,8 @@ internlm2_5-7b-hf_fullbench:
   winogrande: 75
   hellaswag: 93.75
   TheoremQA: 25
+  dingo_en_192: 37.5
+  dingo_zh_170: 100
   college: 12.5
   college_knowledge: 87.5
   bbh-logical_deduction_seven_objects: 43.75
@@ -144,6 +162,8 @@ internlm2_5-7b-turbomind_fullbench:
   winogrande: 87.5
   hellaswag: 93.75
   TheoremQA: 31.25
+  dingo_en_192: 43.75
+  dingo_zh_170: 100
   college: 12.5
   college_knowledge: 87.5
   bbh-logical_deduction_seven_objects: 50
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
index f93f8957..68f6660a 100644
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -43,11 +43,11 @@ gemma-7b-it-hf:
   race-high: 68.75
 
 gemma-2-9b-it-turbomind:
-  gsm8k: 68.75
+  gsm8k: 65.62
   race-high: 84.38
 
 gemma-7b-it-vllm:
-  gsm8k: 28.12
+  gsm8k: 34.38
   race-high: 68.75
 
 internlm2_5-7b-chat-hf:
@@ -95,7 +95,7 @@ llama-3_1-8b-instruct-turbomind:
   race-high: 90.62
 
 llama-3_2-3b-instruct-turbomind:
-  gsm8k: 65.62
+  gsm8k: 62.50
   race-high: 81.25
 
 llama-3-8b-instruct-turbomind:
@@ -112,15 +112,15 @@ mistral-7b-instruct-v0.3-hf:
 
 mistral-nemo-instruct-2407-hf:
   gsm8k: 75
-  race-high: 84.38
+  race-high: 81.25
 
 mistral-nemo-instruct-2407-turbomind:
-  gsm8k: 75
-  race-high: 84.38
+  gsm8k: 68.75
+  race-high: 87.50
 
 mistral-7b-instruct-v0.1-vllm:
-  gsm8k: 37.5
-  race-high: 71.88
+  gsm8k: 34.38
+  race-high: 68.75
 
 mistral-7b-instruct-v0.2-vllm:
   gsm8k: 43.75
@@ -255,13 +255,13 @@ gemma-7b-hf:
   winogrande: 78.12
 
 gemma-2b-vllm:
-  gsm8k: 18.75
+  gsm8k: 15.62
   GPQA_diamond: 6.25
   race-high:
   winogrande:
 
 gemma-7b-vllm:
-  gsm8k: 59.38
+  gsm8k: 53.12
   GPQA_diamond: 6.25
   race-high:
   winogrande:
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 125aaa71..1d7a1189 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -163,9 +163,9 @@ jobs:
           pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           pip uninstall torch torchvision torchaudio -y
-          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
           pip list
       - name: Prepare - create conda env and install torch - cu12
@@ -183,9 +183,9 @@ jobs:
           pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
-          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
+          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
           pip list
       - name: Prepare - reinstall lmdeploy - cu12
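
Note on the config pattern this patch extends: the fullbench script repeatedly trims an imported dataset list down to one subset by matching on its 'abbr' field, and it now builds 'summary_groups' by scanning module-level names that end in '_summary_groups' before appending a hand-written MathBench group. A minimal self-contained sketch of that pattern follows; the toy dataset dicts stand in for the real OpenCompass imports, so it runs on its own:

# Toy stand-ins for the imported OpenCompass dataset configs: each config
# is just a list of dicts keyed by 'abbr'.
mmmlu_lite_datasets = [
    {'abbr': 'openai_mmmlu_lite_AR-XY'},
    {'abbr': 'openai_mmmlu_lite_ZH-CN'},
]
mathbench_datasets = [
    {'abbr': 'mathbench-college-single_choice_en'},
    {'abbr': 'mathbench-primary-cloze_en'},
]

# Keep only the subsets the daily run needs (the same filters as the config).
mmmlu_lite_datasets = [
    x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]

# Summary groups are gathered by scanning module-level names ending in
# '_summary_groups', then extended with a hand-written MathBench group.
mmlu_summary_groups = [{'name': 'mmlu', 'subsets': ['mmlu-stem']}]
teval_summary_groups = [{'name': 'teval', 'subsets': ['teval-en']}]

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append({
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
})

print([d['abbr'] for d in mmmlu_lite_datasets])
# -> ['openai_mmmlu_lite_AR-XY']
print([g['name'] for g in summary_groups])
# -> ['mmlu', 'teval', 'Mathbench']

Scanning locals() works here because the filters run at module level, where locals() is the module namespace; the summary_groups name itself is not yet bound when the comprehension executes, so it does not pick itself up.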
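oc_score_assert.py, only partially shown in this diff, parametrizes over (model, dataset) pairs and compares each run's score against oc_score_baseline_fullbench.yaml. A sketch of that check under stated assumptions: the helper name, tolerance, and YAML snippet below are illustrative, not the CI's exact values.

import yaml  # PyYAML

# Illustrative baseline snippet in the same shape as
# oc_score_baseline_fullbench.yaml: model -> dataset -> score.
BASELINE_YAML = """
internlm2_5-7b-chat-hf_fullbench:
  race-high: 93.75
  cmo_fib: 6.25
"""


def assert_score(baseline, model, dataset, actual, rel_tol=0.05):
    """Fail when a score drifts more than rel_tol from its baseline."""
    expected = baseline[model][dataset]
    assert abs(actual - expected) <= rel_tol * max(abs(expected), 1), (
        f'{model}/{dataset}: got {actual}, baseline {expected}')


baseline = yaml.safe_load(BASELINE_YAML)
assert_score(baseline, 'internlm2_5-7b-chat-hf_fullbench', 'race-high', 92.5)

Because the fullbench runs use tiny per-dataset samples (scores land on multiples of 6.25), a small drift window of this sort is what keeps the assertions from flaking on single-example swings.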
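The workflow change bumps the pinned stack to torch 2.5.1 / flash-attn 2.7.0.post2 / xformers 0.0.28.post3, and the prebuilt flash-attn and xformers wheels must match the torch build they were compiled against. A small, hypothetical sanity check for that alignment; the pin table is copied from the diff, but the script itself is an assumption and not part of the CI:

from importlib.metadata import PackageNotFoundError, version

# Pins copied from the cu11/cu12 install steps above. 'flash_attn' is the
# distribution name the flash-attn wheels register under.
PINS = {
    'torch': '2.5.1',
    'torchvision': '0.20.1',
    'torchaudio': '2.5.1',
    'flash_attn': '2.7.0.post2',
    'xformers': '0.0.28.post3',
}

for pkg, expected in PINS.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f'{pkg}: not installed (expected {expected})')
        continue
    # startswith tolerates local version tags such as '2.5.1+cu118'.
    ok = installed.startswith(expected)
    print(f'{pkg}: {installed} ({"ok" if ok else "MISMATCH, want " + expected})')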