This commit is contained in:
zhulinJulia24 2025-04-03 10:14:29 +08:00
parent b87052718e
commit 0585f9dad2
4 changed files with 22 additions and 66 deletions

View File

@ -11,14 +11,10 @@ with read_base():
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501 winogrande_datasets # noqa: F401, E501
# read hf models - chat models # read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \ from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
models as lmdeploy_glm4_9b_model # noqa: F401, E501 models as lmdeploy_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \ from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
models as hf_deepseek_7b_base_model # noqa: F401, E501 models as hf_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \ from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
@ -110,8 +106,6 @@ with read_base():
from ...volc import infer as volc_infer # noqa: F401, E501 from ...volc import infer as volc_infer # noqa: F401, E501
hf_glm4_9b_model[0]['path'] = 'THUDM/glm-4-9b-hf'
race_datasets = [race_datasets[1]] race_datasets = [race_datasets[1]]
models = sum([v for k, v in locals().items() if k.endswith('_model')], []) models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

View File

@ -246,10 +246,7 @@ class TestCmdCase:
@pytest.mark.parametrize('model, dataset', @pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle_accuracy'), [('internlm2_5-7b-hf', 'race-middle_accuracy'),
('internlm2_5-7b-hf', 'race-high_accuracy'), ('internlm2_5-7b-hf', 'race-high_accuracy'),
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'), ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
('internlm2-1.8b-hf', 'race-middle_accuracy'),
('internlm2-1.8b-hf', 'race-high_accuracy'),
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset) base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset)

View File

@ -8,11 +8,6 @@ internlm2_5-7b_hf:
race-middle_accuracy: 91.78 race-middle_accuracy: 91.78
race-high_accuracy: 90.02 race-high_accuracy: 90.02
internlm2-1.8b-hf:
demo_gsm8k_accuracy: 15.62
race-middle_accuracy: 71.66
race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy: internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 89.06 demo_gsm8k_accuracy: 89.06
race-middle_accuracy: 92.76 race-middle_accuracy: 92.76

View File

@ -210,13 +210,8 @@ chat:
gsm8k_accuracy: 78.12 gsm8k_accuracy: 78.12
race-high_accuracy: 78.12 race-high_accuracy: 78.12
base: base:
glm-4-9b-hf:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
glm-4-9b-turbomind: glm-4-9b-turbomind:
gsm8k_accuracy: 62.5 gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 28.12 GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 84.38 winogrande_accuracy: 84.38
@ -226,7 +221,7 @@ base:
race-high_accuracy: 46.88 race-high_accuracy: 46.88
winogrande_accuracy: 71.88 winogrande_accuracy: 71.88
deepseek-7b-base-turbomind: deepseek-7b-base-turbomind:
gsm8k_accuracy: 21.88 gsm8k_accuracy: 18.75
GPQA_diamond_accuracy: 0 GPQA_diamond_accuracy: 0
race-high_accuracy: 43.75 race-high_accuracy: 43.75
winogrande_accuracy: 84.38 winogrande_accuracy: 84.38
@ -255,16 +250,21 @@ base:
GPQA_diamond_accuracy: 3.12 GPQA_diamond_accuracy: 3.12
race-high_accuracy: 65.62 race-high_accuracy: 65.62
winogrande_accuracy: 71.88 winogrande_accuracy: 71.88
gemma-2-9b-turbomind:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 78.12
winogrande_accuracy: 50
gemma-2b-vllm: gemma-2b-vllm:
gsm8k_accuracy: 15.62 gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12 GPQA_diamond_accuracy: 3.12
race-high_accuracy: race-high_accuracy: 28.12
winogrande_accuracy: winogrande_accuracy: 68.75
gemma-7b-vllm: gemma-7b-vllm:
gsm8k_accuracy: 43.75 gsm8k_accuracy: 43.75
GPQA_diamond_accuracy: 9.38 GPQA_diamond_accuracy: 6.25
race-high_accuracy: race-high_accuracy: 81.25
winogrande_accuracy: winogrande_accuracy: 81.25
internlm2_5-7b-hf: internlm2_5-7b-hf:
gsm8k_accuracy: 37.5 gsm8k_accuracy: 37.5
GPQA_diamond_accuracy: 25 GPQA_diamond_accuracy: 25
@ -275,30 +275,25 @@ base:
GPQA_diamond_accuracy: 18.75 GPQA_diamond_accuracy: 18.75
race-high_accuracy: 62.5 race-high_accuracy: 62.5
winogrande_accuracy: 78.12 winogrande_accuracy: 78.12
internlm2-base-7b-hf:
gsm8k_accuracy: 3.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 75
winogrande_accuracy: 65.62
internlm2-1.8b-turbomind: internlm2-1.8b-turbomind:
gsm8k_accuracy: 12.5 gsm8k_accuracy: 6.25
GPQA_diamond_accuracy: 9.38 GPQA_diamond_accuracy: 12.5
race-high_accuracy: 71.88 race-high_accuracy: 71.88
winogrande_accuracy: 78.12 winogrande_accuracy: 75
internlm2_5-7b-turbomind: internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.50 gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 34.38 GPQA_diamond_accuracy: 34.38
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 87.50 winogrande_accuracy: 84.38
internlm2-7b-turbomind: internlm2-7b-turbomind:
gsm8k_accuracy: 53.12 gsm8k_accuracy: 50
GPQA_diamond_accuracy: 21.88 GPQA_diamond_accuracy: 18.75
race-high_accuracy: 71.88 race-high_accuracy: 71.88
winogrande_accuracy: 84.38 winogrande_accuracy: 84.38
internlm2-base-7b-turbomind: internlm2-base-7b-turbomind:
gsm8k_accuracy: 37.50 gsm8k_accuracy: 37.50
GPQA_diamond_accuracy: 28.12 GPQA_diamond_accuracy: 21.88
race-high_accuracy: 81.25 race-high_accuracy: 84.38
winogrande_accuracy: 75 winogrande_accuracy: 75
llama-2-7b-hf: llama-2-7b-hf:
gsm8k_accuracy: 21.88 gsm8k_accuracy: 21.88
@ -395,21 +390,6 @@ base:
GPQA_diamond_accuracy: 40.62 GPQA_diamond_accuracy: 40.62
race-high_accuracy: 87.5 race-high_accuracy: 87.5
winogrande_accuracy: 71.88 winogrande_accuracy: 71.88
deepseek-v2-lite-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 59.38
winogrande_accuracy: 71.88
internlm2-20b-hf:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 68.75
winogrande_accuracy: 75
internlm2-base-20b-hf:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 84.38
winogrande_accuracy: 65.62
internlm2-20b-turbomind: internlm2-20b-turbomind:
gsm8k_accuracy: 71.88 gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 15.62 GPQA_diamond_accuracy: 15.62
@ -430,11 +410,6 @@ base:
GPQA_diamond_accuracy: 28.12 GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 81.25 winogrande_accuracy: 81.25
deepseek-67b-base-hf:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 81.25
winogrande_accuracy: 90.62
deepseek-67b-base-turbomind: deepseek-67b-base-turbomind:
gsm8k_accuracy: 56.25 gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 28.12 GPQA_diamond_accuracy: 28.12
@ -455,8 +430,3 @@ base:
GPQA_diamond_accuracy: 15.62 GPQA_diamond_accuracy: 15.62
race-high_accuracy: 93.75 race-high_accuracy: 93.75
winogrande_accuracy: 84.38 winogrande_accuracy: 84.38
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38