From e3c2521df5975f2362c8eb5955397045f570efeb Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 3 Apr 2025 13:47:45 +0800 Subject: [PATCH] update --- .../scripts/oc_score_baseline_testrange.yaml | 48 +++++++++---------- .github/workflows/daily-run-test.yml | 10 ++-- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 06ba83bf..16a13209 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -316,7 +316,7 @@ base: race-high_accuracy: 78.12 winogrande_accuracy: 78.12 llama-3-8b-turbomind: - gsm8k_accuracy: 50 + gsm8k_accuracy: 46.88 GPQA_diamond_accuracy: 12.50 race-high_accuracy: 65.62 winogrande_accuracy: 78.12 @@ -332,14 +332,14 @@ base: winogrande_accuracy: 71.88 qwen2.5-1.5b-turbomind: gsm8k_accuracy: 62.50 - GPQA_diamond_accuracy: 12.50 - race-high_accuracy: 78.12 - winogrande_accuracy: 68.75 - qwen2.5-7b-turbomind: - gsm8k_accuracy: 75.00 - GPQA_diamond_accuracy: 25 - race-high_accuracy: 87.5 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 75 winogrande_accuracy: 71.88 + qwen2.5-7b-turbomind: + gsm8k_accuracy: 71.88 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 87.5 + winogrande_accuracy: 75.00 qwen1.5-moe-a2.7b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 18.75 @@ -361,17 +361,17 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 68.75 qwen2-1.5b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 81.25 winogrande_accuracy: 75 qwen2-7b-turbomind: - gsm8k_accuracy: 75.00 + gsm8k_accuracy: 65.62 GPQA_diamond_accuracy: 12.5 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 qwen1.5-0.5b-vllm: - gsm8k_accuracy: 9.38 + gsm8k_accuracy: 6.25 GPQA_diamond_accuracy: 0 race-high_accuracy: 56.25 winogrande_accuracy: 62.5 @@ -387,12 +387,12 @@ base: winogrande_accuracy: 59.38 
yi-1.5-9b-turbomind: gsm8k_accuracy: 78.12 - GPQA_diamond_accuracy: 40.62 + GPQA_diamond_accuracy: 43.75 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 internlm2-20b-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 75 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 68.75 winogrande_accuracy: 81.25 qwen2.5-14b-hf: @@ -406,27 +406,27 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 78.12 qwen2.5-32b-turbomind: - gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 28.12 + gsm8k_accuracy: 87.5 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 deepseek-67b-base-turbomind: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 53.12 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 81.25 winogrande_accuracy: 84.38 llama-3-70b-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 qwen2.5-72b-turbomind: gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 34.38 + GPQA_diamond_accuracy: 31.25 race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 65.62 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 3.12 race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 + winogrande_accuracy: 81.25 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index ec0fc644..cb5844cf 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -258,19 +258,19 @@ jobs: conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details + opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ 
github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details + opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a lmdeploy --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a 
lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a vllm --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run model test - api