update

2025-05-30 16:03:24 +08:00 · 2025-04-03 09:57:01 +08:00 · 2025-04-03 09:57:01 +08:00 · c2cc5f7054
commit c2cc5f7054
parent 69082bafb8
4 changed files with 8 additions and 16 deletions
--- a/.github/scripts/eval_regression_base_models.py
+++ b/.github/scripts/eval_regression_base_models.py
@@ -116,6 +116,8 @@ with read_base():
    from ...volc import infer as volc_infer  # noqa: F401, E501
 hf_glm4_9b_model[0]['path'] = 'THUDM/glm-4-9b-hf'
 race_datasets = [race_datasets[1]]
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
--- a/.github/scripts/eval_regression_chat_models.py
+++ b/.github/scripts/eval_regression_chat_models.py
@@ -97,8 +97,6 @@ with read_base():
        models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
        models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
        models as \
        lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -89,9 +89,6 @@ chat:
    llama-3-8b-instruct-turbomind:
        gsm8k_accuracy: 68.75
        race-high_accuracy: 84.38
    internvl2_5-8b-turbomind:
        gsm8k_accuracy: 0
        race-high_accuracy: 0
    mistral-7b-instruct-v0.2-hf:
        gsm8k_accuracy: 40.62
        race-high_accuracy: 75
@@ -182,15 +179,15 @@ chat:
    yi-1.5-34b-chat-turbomind:
        gsm8k_accuracy: 75.00
        race-high_accuracy: 93.75
    deepseek-67b-chat-turbomind:
        gsm8k_accuracy: 75.00
        race-high_accuracy: 78.12
    deepseek-r1-distill-qwen-32b-turbomind:
        gsm8k_accuracy: 25
        race-high_accuracy: 90.62
    llama-3_3-70b-instruct-turbomind:
        gsm8k_accuracy: 93.75
        race-high_accuracy: 87.5
    mixtral-8x7b-instruct-v0.1-hf:
        gsm8k_accuracy: 59.38
        race-high_accuracy: 81.25
    mixtral-large-instruct-2411-turbomind:
        gsm8k_accuracy: 87.50
        race-high_accuracy: 93.75
@@ -228,15 +225,10 @@ base:
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 46.88
        winogrande_accuracy: 71.88
    deepseek-moe-16b-base-hf:
        gsm8k_accuracy: 21.88
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 21.88
        winogrande_accuracy: 65.62
    deepseek-7b-base-turbomind:
        gsm8k_accuracy: 21.88
        GPQA_diamond_accuracy: 0
-        race-high_accuracy: 46.88
+        race-high_accuracy: 43.75
        winogrande_accuracy: 84.38
    deepseek-moe-16b-base-vllm:
        gsm8k_accuracy: 21.88
@@ -269,7 +261,7 @@ base:
        race-high_accuracy:
        winogrande_accuracy:
    gemma-7b-vllm:
-        gsm8k_accuracy: 53.12
+        gsm8k_accuracy: 43.75
        GPQA_diamond_accuracy: 9.38
        race-high_accuracy:
        winogrande_accuracy:
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -258,7 +258,7 @@ jobs:
          conda info --envs
          export from_tf=TRUE
          python tools/list_configs.py internlm2_5 mmlu
-          opencompass --models hf_internlm2_5_7b hf_internlm3_8b_instruct --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
+          opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
          python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details