diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py index 904d2060..8543a30d 100644 --- a/.github/scripts/eval_regression_base_models.py +++ b/.github/scripts/eval_regression_base_models.py @@ -11,14 +11,10 @@ with read_base(): from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ winogrande_datasets # noqa: F401, E501 # read hf models - chat models - from opencompass.configs.models.chatglm.hf_glm4_9b import \ - models as hf_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \ models as lmdeploy_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \ models as hf_deepseek_7b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ - models as hf_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \ @@ -110,8 +106,6 @@ with read_base(): from ...volc import infer as volc_infer # noqa: F401, E501 -hf_glm4_9b_model[0]['path'] = 'THUDM/glm-4-9b-hf' - race_datasets = [race_datasets[1]] models = sum([v for k, v in locals().items() if k.endswith('_model')], []) datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 27c2e761..d8c778b7 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -246,10 +246,7 @@ class TestCmdCase: @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b-hf', 'race-middle_accuracy'), ('internlm2_5-7b-hf', 'race-high_accuracy'), - ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'), - ('internlm2-1.8b-hf', 'race-middle_accuracy'), - ('internlm2-1.8b-hf', 'race-high_accuracy'), - ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')]) + ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')]) def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index cd2e3328..9cf6781e 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -8,11 +8,6 @@ internlm2_5-7b_hf: race-middle_accuracy: 91.78 race-high_accuracy: 90.02 -internlm2-1.8b-hf: - demo_gsm8k_accuracy: 15.62 - race-middle_accuracy: 71.66 - race-high_accuracy: 66.38 - internlm2_5-7b-chat-lmdeploy: demo_gsm8k_accuracy: 89.06 race-middle_accuracy: 92.76 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index e317e1d5..06ba83bf 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -210,13 +210,8 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 78.12 base: - glm-4-9b-hf: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 glm-4-9b-turbomind: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 56.25 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 @@ -226,7 +221,7 @@ base: race-high_accuracy: 46.88 winogrande_accuracy: 71.88 deepseek-7b-base-turbomind: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 18.75 GPQA_diamond_accuracy: 0 race-high_accuracy: 43.75 winogrande_accuracy: 84.38 @@ -255,16 +250,21 @@ base: GPQA_diamond_accuracy: 3.12 race-high_accuracy: 65.62 winogrande_accuracy: 71.88 + gemma-2-9b-turbomind: + gsm8k_accuracy: 68.75 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 78.12 + winogrande_accuracy: 50 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 - race-high_accuracy: - winogrande_accuracy: + race-high_accuracy: 28.12 + winogrande_accuracy: 68.75 gemma-7b-vllm: gsm8k_accuracy: 43.75 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: - winogrande_accuracy: + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 81.25 + winogrande_accuracy: 81.25 internlm2_5-7b-hf: gsm8k_accuracy: 37.5 GPQA_diamond_accuracy: 25 @@ -275,30 +275,25 @@ base: GPQA_diamond_accuracy: 18.75 race-high_accuracy: 62.5 winogrande_accuracy: 78.12 - internlm2-base-7b-hf: - gsm8k_accuracy: 3.12 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 75 - winogrande_accuracy: 65.62 internlm2-1.8b-turbomind: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 6.25 + GPQA_diamond_accuracy: 12.5 race-high_accuracy: 71.88 - winogrande_accuracy: 78.12 + winogrande_accuracy: 75 internlm2_5-7b-turbomind: - gsm8k_accuracy: 62.50 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 34.38 race-high_accuracy: 93.75 - winogrande_accuracy: 87.50 + winogrande_accuracy: 84.38 internlm2-7b-turbomind: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 21.88 + gsm8k_accuracy: 50 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 71.88 winogrande_accuracy: 84.38 internlm2-base-7b-turbomind: gsm8k_accuracy: 37.50 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 81.25 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 84.38 winogrande_accuracy: 75 llama-2-7b-hf: gsm8k_accuracy: 21.88 @@ -395,21 +390,6 @@ base: GPQA_diamond_accuracy: 40.62 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 - deepseek-v2-lite-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 59.38 - winogrande_accuracy: 71.88 - internlm2-20b-hf: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 68.75 - winogrande_accuracy: 75 - internlm2-base-20b-hf: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 84.38 - winogrande_accuracy: 65.62 internlm2-20b-turbomind: gsm8k_accuracy: 71.88 GPQA_diamond_accuracy: 15.62 @@ -430,11 +410,6 @@ base: GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 - deepseek-67b-base-hf: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 81.25 - winogrande_accuracy: 90.62 deepseek-67b-base-turbomind: gsm8k_accuracy: 56.25 GPQA_diamond_accuracy: 28.12 @@ -455,8 +430,3 @@ base: GPQA_diamond_accuracy: 15.62 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 - llama-3-70b-hf: - gsm8k_accuracy: 62.5 - GPQA_diamond_accuracy: 3.12 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38