diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py index a3045378..56e64c2f 100644 --- a/.github/scripts/eval_regression_base_models.py +++ b/.github/scripts/eval_regression_base_models.py @@ -116,6 +116,8 @@ with read_base(): from ...volc import infer as volc_infer # noqa: F401, E501 +hf_glm4_9b_model[0]['path'] = 'THUDM/glm-4-9b-hf' + race_datasets = [race_datasets[1]] models = sum([v for k, v in locals().items() if k.endswith('_model')], []) datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py index f2bc484e..bfe923f6 100644 --- a/.github/scripts/eval_regression_chat_models.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -97,8 +97,6 @@ with read_base(): models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ - models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \ models as \ lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index ea0e88f6..e317e1d5 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -89,9 +89,6 @@ chat: llama-3-8b-instruct-turbomind: gsm8k_accuracy: 68.75 race-high_accuracy: 84.38 - internvl2_5-8b-turbomind: - gsm8k_accuracy: 0 - race-high_accuracy: 0 mistral-7b-instruct-v0.2-hf: gsm8k_accuracy: 40.62 race-high_accuracy: 75 @@ -182,15 +179,15 @@ chat: yi-1.5-34b-chat-turbomind: gsm8k_accuracy: 75.00 race-high_accuracy: 93.75 + deepseek-67b-chat-turbomind: + gsm8k_accuracy: 75.00 + race-high_accuracy: 78.12 deepseek-r1-distill-qwen-32b-turbomind: gsm8k_accuracy: 25 race-high_accuracy: 90.62 llama-3_3-70b-instruct-turbomind: gsm8k_accuracy: 93.75 race-high_accuracy: 87.5 - mixtral-8x7b-instruct-v0.1-hf: - gsm8k_accuracy: 59.38 - race-high_accuracy: 81.25 mixtral-large-instruct-2411-turbomind: gsm8k_accuracy: 87.50 race-high_accuracy: 93.75 @@ -228,15 +225,10 @@ base: GPQA_diamond_accuracy: 0 race-high_accuracy: 46.88 winogrande_accuracy: 71.88 - deepseek-moe-16b-base-hf: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 21.88 - winogrande_accuracy: 65.62 deepseek-7b-base-turbomind: gsm8k_accuracy: 21.88 GPQA_diamond_accuracy: 0 - race-high_accuracy: 46.88 + race-high_accuracy: 43.75 winogrande_accuracy: 84.38 deepseek-moe-16b-base-vllm: gsm8k_accuracy: 21.88 @@ -269,7 +261,7 @@ base: race-high_accuracy: winogrande_accuracy: gemma-7b-vllm: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 43.75 GPQA_diamond_accuracy: 9.38 race-high_accuracy: winogrande_accuracy: diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 40b1c41d..ec0fc644 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -258,7 +258,7 @@ jobs: conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b hf_internlm3_8b_instruct --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details + opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details