[CI] update torch version and add more datasets into daily testcase (#1701)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
zhulinJulia24 authored 2024-11-21 10:37:33 +08:00; committed by GitHub
parent 05044dfaf2
commit ed81f9df30
7 changed files with 160 additions and 77 deletions

View File

@@ -7,6 +7,8 @@ with read_base():
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.dingo.dingo_gen import \
datasets as dingo_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
@@ -120,6 +122,8 @@ summarizer = dict(
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['dingo_en_192', 'score'],
['dingo_zh_170', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',

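For context on the hunk above: under read_base(), each dataset config exports a *_datasets list (the dingo config exports `datasets`, hence the rename to `dingo_datasets`). A minimal sketch of the convention these configs rely on, assuming the usual OpenCompass pattern; the aggregation line itself sits outside the shown hunk:

# Minimal sketch, not part of this commit: the usual OpenCompass pattern for
# merging every imported `*_datasets` variable into one flat `datasets` list.
from mmengine.config import read_base

with read_base():
    # Imports under read_base() are resolved by mmengine's config machinery.
    from opencompass.configs.datasets.dingo.dingo_gen import \
        datasets as dingo_datasets  # noqa: F401, E501

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])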
View File

@@ -59,6 +59,8 @@ with read_base():
models as hf_llama3_2_3b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
models as lmdeploy_llama2_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \

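The model-list file above gains an lmdeploy-backed llama2-7b-chat entry. A minimal sketch of the matching collection step, assuming the standard OpenCompass convention; the merge line is outside the shown hunk:

# Sketch only; assumes the standard convention, not code from this diff.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
        models as lmdeploy_llama2_7b_chat_model  # noqa: F401, E501

# Merge every `*_model` list into the `models` list the runner iterates over.
models = sum((v for k, v in locals().items() if k.endswith('_model')), [])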
View File

@@ -3,12 +3,16 @@ from mmengine.config import read_base
with read_base():
# read hf models - chat models
# Dataset
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
cmo_fib_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
@@ -28,6 +32,8 @@ with read_base():
humanevalx_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
LCB_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
@@ -38,6 +44,10 @@ with read_base():
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
mmmlu_lite_datasets # noqa: F401, E501
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
musr_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
@@ -77,10 +87,14 @@ with read_base():
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.musr_average import \
summarizer as musr_summarizer # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import \
scicode_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.teval import \
teval_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.mmmlu_lite import \
mmmlu_summary_groups # noqa: F401, E501
# For HumanEval-X Evaluation
# Apply the evaluator ip_address and port
@@ -122,6 +136,10 @@ mmlu_datasets = [
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mmmlu_lite_datasets = [
x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
@@ -137,52 +155,68 @@ datasets += teval_en_datasets
datasets += teval_zh_datasets
# datasets += SciCode_datasets
musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
{
'name': 'Mathbench',
'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}, )
# Summarizer
summarizer = dict(
dataset_abbrs=[
'Language',
['race-high', 'accuracy'],
['ARC-c', 'accuracy'],
['BoolQ', 'accuracy'],
['mmlu_pro', 'naive_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['mmmlu_lite', 'naive_average'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['drop', 'accuracy'],
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
['musr_average', 'naive_average'],
'',
'Math Calculation',
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['math', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
['Mathbench', 'naive_average'],
'',
'Knowledge',
['wikibench-wiki-single_choice_cncircular', 'perf_4'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['cmmlu', 'naive_average'],
['mmlu', 'naive_average'],
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['lcb_code_generation', 'pass@1'],
['lcb_code_execution', 'pass@1'],
['lcb_test_output', 'pass@1'],
'',
'Agent',
['teval', 'naive_average'],
['SciCode', 'accuracy'],
['SciCode', 'sub_accuracy'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
['IFEval', 'Prompt-level-strict-accuracy'],
['gsm8k', 'accuracy'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['hellaswag', 'accuracy'],
['TheoremQA', 'score'],
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
'###### Overall: Average between MathBench-A and MathBench-T ######',
'Overall',
'',
'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two',
''
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
@@ -212,15 +246,6 @@ summarizer = dict(
'mmlu_pro_psychology',
'mmlu_pro_other',
'',
'GaokaoBench_2010-2022_Math_II_MCQs',
'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
'',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-go',
'humanevalx-java',
'humanevalx-js',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
@@ -228,9 +253,38 @@ summarizer = dict(
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
'',
'mmmlu_lite',
'openai_mmmlu_lite_AR-XY',
'openai_mmmlu_lite_BN-BD',
'openai_mmmlu_lite_DE-DE',
'openai_mmmlu_lite_ES-LA',
'openai_mmmlu_lite_FR-FR',
'openai_mmmlu_lite_HI-IN',
'openai_mmmlu_lite_ID-ID',
'openai_mmmlu_lite_IT-IT',
'openai_mmmlu_lite_JA-JP',
'openai_mmmlu_lite_KO-KR',
'openai_mmmlu_lite_PT-BR',
'openai_mmmlu_lite_SW-KE',
'openai_mmmlu_lite_YO-NG',
'openai_mmmlu_lite_ZH-CN',
'',
'###### MathBench-A: Application Part ######',
'college',
'high',
'middle',
'primary',
'arithmetic',
'mathbench-a (average)',
'###### MathBench-T: Theory Part ######',
'college_knowledge',
'high_knowledge',
'middle_knowledge',
'primary_knowledge',
'mathbench-t (average)',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
summary_groups=summary_groups,
)
for d in datasets:

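The config above pulls the summary_groups construction out of the summarizer dict so a custom 'Mathbench' group can be appended before it is passed in. A toy illustration of what the locals() scan produces; the group names and subsets below are invented for illustration only:

# Toy illustration, not from this commit: merging `*_summary_groups` lists and
# appending an extra group, mirroring the pattern in the hunks above.
mmlu_summary_groups = [{'name': 'mmlu', 'subsets': ['mmlu-stem', 'mmlu-other']}]
musr_summary_groups = [{'name': 'musr_average', 'subsets': ['musr_murder_mysteries']}]

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append({
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
})
# summary_groups now contains the mmlu, musr_average and Mathbench definitions.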
View File

@@ -131,14 +131,16 @@ class TestChatObjFullbench:
'internlm2_5-7b-chat-hf_fullbench',
'internlm2_5-7b-chat-turbomind_fullbench'
] for p2 in [
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot',
'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA',
'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
'ds1000_Pytorch', 'ds1000_Matplotlib'
'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY',
'college', 'college_knowledge'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):
@@ -188,9 +190,10 @@ class TestBaseFullbench:
'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
'TheoremQA', 'college', 'college_knowledge',
'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math'
'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college',
'college_knowledge', 'bbh-logical_deduction_seven_objects',
'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific',
'mmlu_pro_math'
]])
def test_model_dataset_score(self, baseline_scores_fullbench,
result_scores, model, dataset):

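The tests above compare per-model, per-dataset scores against the baseline YAML that follows. A minimal sketch of the shape of such a check, with illustrative fixture data and an assumed tolerance; the real assertion helper and thresholds are not shown in this diff:

# Hedged sketch; the parametrize shape follows the diff, the comparison is an assumption.
import pytest

BASELINE = {('internlm2_5-7b-chat-hf_fullbench', 'race-high'): 93.75}  # illustrative
RESULTS = {('internlm2_5-7b-chat-hf_fullbench', 'race-high'): 93.75}   # illustrative

@pytest.mark.parametrize('model, dataset', [
    (p1, p2)
    for p1 in ['internlm2_5-7b-chat-hf_fullbench',
               'internlm2_5-7b-chat-turbomind_fullbench']
    for p2 in ['race-high', 'ARC-c', 'BoolQ']
])
def test_model_dataset_score(model, dataset):
    baseline, result = BASELINE.get((model, dataset)), RESULTS.get((model, dataset))
    if baseline is None or result is None:
        pytest.skip('combination not covered in this illustration')
    # Tolerance is an assumption; the real suite may require exact matches.
    assert result == pytest.approx(baseline, abs=0.01)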
View File

@@ -2,19 +2,24 @@ internlm2_5-7b-chat-hf_fullbench:
race-high: 93.75
ARC-c: 93.75
BoolQ: 81.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 81.25
GPQA_diamond: 25
hellaswag: 87.5
TheoremQA: 18.75
musr_average: 39.58
gsm8k: 56.25
math: 75
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 50
sanitized_mbpp: 68.75
ds1000: 16.96
gsm8k: 56.25
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 87.5
TheoremQA: 18.75
college: 12.5
college_knowledge: 87.5
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 18.75
bbh-logical_deduction_seven_objects: 50
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 72.6
@@ -27,6 +32,9 @@ internlm2_5-7b-chat-hf_fullbench:
ds1000_Sklearn: 18.75
ds1000_Pytorch: 12.5
ds1000_Matplotlib: 43.75
openai_mmmlu_lite_AR-XY: 37.5
college: 12.5
college_knowledge: 87.5
Alignbench总分: 0.65
Alignbench专业能力: 7.83
AlpacaEvaltotal: 0
@@ -56,19 +64,24 @@ internlm2_5-7b-chat-turbomind_fullbench:
race-high: 93.75
ARC-c: 87.5
BoolQ: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
IFEval: 50
drop: 75
GPQA_diamond: 25
hellaswag: 81.25
TheoremQA: 6.25
musr_average: 39.58
gsm8k: 68.75
math: 75
GPQA_diamond: 25
cmo_fib: 6.25
aime2024: 6.25
wikibench-wiki-single_choice_cncircular: 25
sanitized_mbpp: 68.75
ds1000: 13.39
gsm8k: 68.75
triviaqa_wiki_1shot: 50
nq_open_1shot: 25
hellaswag: 81.25
TheoremQA: 6.25
college: 0
college_knowledge: 87.5
lcb_code_generation: 12.5
lcb_code_execution: 43.75
lcb_test_output: 12.5
bbh-logical_deduction_seven_objects: 56.25
bbh-multistep_arithmetic_two: 68.75
mmlu-other: 74.04
@@ -81,6 +94,9 @@ internlm2_5-7b-chat-turbomind_fullbench:
ds1000_Sklearn: 18.75
ds1000_Pytorch: 6.25
ds1000_Matplotlib: 37.5
openai_mmmlu_lite_AR-XY: 37.5
college: 0
college_knowledge: 87.5
Alignbench总分: 0.64
Alignbench专业能力: 7.6
AlpacaEvaltotal: 10
@@ -121,6 +137,8 @@ internlm2_5-7b-hf_fullbench:
winogrande: 75
hellaswag: 93.75
TheoremQA: 25
dingo_en_192: 37.5
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 43.75
@@ -144,6 +162,8 @@ internlm2_5-7b-turbomind_fullbench:
winogrande: 87.5
hellaswag: 93.75
TheoremQA: 31.25
dingo_en_192: 43.75
dingo_zh_170: 100
college: 12.5
college_knowledge: 87.5
bbh-logical_deduction_seven_objects: 50

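These per-model baselines back the baseline_scores_fullbench fixture used by the tests above. A minimal sketch of loading and querying such a file, assuming a standard PyYAML safe load and an assumed file path; the real fixture implementation is not part of this diff:

# Sketch with an assumed file path; not code from this commit.
import yaml

with open('.github/scripts/oc_score_baseline_fullbench.yaml') as f:
    baselines = yaml.safe_load(f)

# Nested mapping: model abbreviation -> dataset/metric name -> expected score.
print(baselines['internlm2_5-7b-chat-hf_fullbench']['race-high'])  # 93.75 above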
View File

@@ -43,11 +43,11 @@ gemma-7b-it-hf:
race-high: 68.75
gemma-2-9b-it-turbomind:
gsm8k: 68.75
gsm8k: 65.62
race-high: 84.38
gemma-7b-it-vllm:
gsm8k: 28.12
gsm8k: 34.38
race-high: 68.75
internlm2_5-7b-chat-hf:
@@ -95,7 +95,7 @@ llama-3_1-8b-instruct-turbomind:
race-high: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k: 65.62
gsm8k: 62.50
race-high: 81.25
llama-3-8b-instruct-turbomind:
@@ -112,15 +112,15 @@ mistral-7b-instruct-v0.3-hf:
mistral-nemo-instruct-2407-hf:
gsm8k: 75
race-high: 84.38
race-high: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k: 75
race-high: 84.38
gsm8k: 68.75
race-high: 87.50
mistral-7b-instruct-v0.1-vllm:
gsm8k: 37.5
race-high: 71.88
gsm8k: 34.38
race-high: 68.75
mistral-7b-instruct-v0.2-vllm:
gsm8k: 43.75
@@ -255,13 +255,13 @@ gemma-7b-hf:
winogrande: 78.12
gemma-2b-vllm:
gsm8k: 18.75
gsm8k: 15.62
GPQA_diamond: 6.25
race-high:
winogrande:
gemma-7b-vllm:
gsm8k: 59.38
gsm8k: 53.12
GPQA_diamond: 6.25
race-high:
winogrande:

View File

@@ -163,9 +163,9 @@ jobs:
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12
@@ -183,9 +183,9 @@ jobs:
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - reinstall lmdeploy - cu12
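With the bump to torch 2.5.1 (cu118 wheels in the first env, the default index in the cu12 env) and the rebuilt flash-attn and xformers wheels, a quick import check catches version or ABI mismatches early. A suggested sanity check, not part of this commit:

# Suggested post-install sanity check; not part of the workflow in this diff.
import torch

print('torch', torch.__version__)          # expect 2.5.1+cu118 or 2.5.1
print('cuda available:', torch.cuda.is_available())
print('cuda build:', torch.version.cuda)   # 11.8 or 12.x depending on the env

import flash_attn  # an import failure here usually indicates an ABI mismatch
import xformers

print('flash_attn', flash_attn.__version__)
print('xformers', xformers.__version__)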