mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
update
This commit is contained in:
parent
7b2bee4292
commit
e3c2521df5
48
.github/scripts/oc_score_baseline_testrange.yaml
vendored
48
.github/scripts/oc_score_baseline_testrange.yaml
vendored
@ -316,7 +316,7 @@ base:
|
|||||||
race-high_accuracy: 78.12
|
race-high_accuracy: 78.12
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 78.12
|
||||||
llama-3-8b-turbomind:
|
llama-3-8b-turbomind:
|
||||||
gsm8k_accuracy: 50
|
gsm8k_accuracy: 46.88
|
||||||
GPQA_diamond_accuracy: 12.50
|
GPQA_diamond_accuracy: 12.50
|
||||||
race-high_accuracy: 65.62
|
race-high_accuracy: 65.62
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 78.12
|
||||||
@ -332,14 +332,14 @@ base:
|
|||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
qwen2.5-1.5b-turbomind:
|
qwen2.5-1.5b-turbomind:
|
||||||
gsm8k_accuracy: 62.50
|
gsm8k_accuracy: 62.50
|
||||||
GPQA_diamond_accuracy: 12.50
|
GPQA_diamond_accuracy: 15.62
|
||||||
race-high_accuracy: 78.12
|
race-high_accuracy: 75
|
||||||
winogrande_accuracy: 68.75
|
|
||||||
qwen2.5-7b-turbomind:
|
|
||||||
gsm8k_accuracy: 75.00
|
|
||||||
GPQA_diamond_accuracy: 25
|
|
||||||
race-high_accuracy: 87.5
|
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
|
qwen2.5-7b-turbomind:
|
||||||
|
gsm8k_accuracy: 71.88
|
||||||
|
GPQA_diamond_accuracy: 18.75
|
||||||
|
race-high_accuracy: 87.5
|
||||||
|
winogrande_accuracy: 75.00
|
||||||
qwen1.5-moe-a2.7b-hf:
|
qwen1.5-moe-a2.7b-hf:
|
||||||
gsm8k_accuracy: 62.5
|
gsm8k_accuracy: 62.5
|
||||||
GPQA_diamond_accuracy: 18.75
|
GPQA_diamond_accuracy: 18.75
|
||||||
@ -361,17 +361,17 @@ base:
|
|||||||
race-high_accuracy: 87.5
|
race-high_accuracy: 87.5
|
||||||
winogrande_accuracy: 68.75
|
winogrande_accuracy: 68.75
|
||||||
qwen2-1.5b-turbomind:
|
qwen2-1.5b-turbomind:
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 9.38
|
GPQA_diamond_accuracy: 12.50
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 81.25
|
||||||
winogrande_accuracy: 75
|
winogrande_accuracy: 75
|
||||||
qwen2-7b-turbomind:
|
qwen2-7b-turbomind:
|
||||||
gsm8k_accuracy: 75.00
|
gsm8k_accuracy: 65.62
|
||||||
GPQA_diamond_accuracy: 12.5
|
GPQA_diamond_accuracy: 12.5
|
||||||
race-high_accuracy: 87.5
|
race-high_accuracy: 87.5
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
qwen1.5-0.5b-vllm:
|
qwen1.5-0.5b-vllm:
|
||||||
gsm8k_accuracy: 9.38
|
gsm8k_accuracy: 6.25
|
||||||
GPQA_diamond_accuracy: 0
|
GPQA_diamond_accuracy: 0
|
||||||
race-high_accuracy: 56.25
|
race-high_accuracy: 56.25
|
||||||
winogrande_accuracy: 62.5
|
winogrande_accuracy: 62.5
|
||||||
@ -387,12 +387,12 @@ base:
|
|||||||
winogrande_accuracy: 59.38
|
winogrande_accuracy: 59.38
|
||||||
yi-1.5-9b-turbomind:
|
yi-1.5-9b-turbomind:
|
||||||
gsm8k_accuracy: 78.12
|
gsm8k_accuracy: 78.12
|
||||||
GPQA_diamond_accuracy: 40.62
|
GPQA_diamond_accuracy: 43.75
|
||||||
race-high_accuracy: 87.5
|
race-high_accuracy: 87.5
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
internlm2-20b-turbomind:
|
internlm2-20b-turbomind:
|
||||||
gsm8k_accuracy: 71.88
|
gsm8k_accuracy: 75
|
||||||
GPQA_diamond_accuracy: 15.62
|
GPQA_diamond_accuracy: 18.75
|
||||||
race-high_accuracy: 68.75
|
race-high_accuracy: 68.75
|
||||||
winogrande_accuracy: 81.25
|
winogrande_accuracy: 81.25
|
||||||
qwen2.5-14b-hf:
|
qwen2.5-14b-hf:
|
||||||
@ -406,27 +406,27 @@ base:
|
|||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 78.12
|
||||||
qwen2.5-32b-turbomind:
|
qwen2.5-32b-turbomind:
|
||||||
gsm8k_accuracy: 84.38
|
gsm8k_accuracy: 87.5
|
||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 18.75
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 81.25
|
winogrande_accuracy: 81.25
|
||||||
deepseek-67b-base-turbomind:
|
deepseek-67b-base-turbomind:
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 53.12
|
||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 28.12
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 81.25
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
llama-3-70b-turbomind:
|
llama-3-70b-turbomind:
|
||||||
gsm8k_accuracy: 59.38
|
gsm8k_accuracy: 56.25
|
||||||
GPQA_diamond_accuracy: 9.38
|
GPQA_diamond_accuracy: 12.50
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
qwen2.5-72b-turbomind:
|
qwen2.5-72b-turbomind:
|
||||||
gsm8k_accuracy: 84.38
|
gsm8k_accuracy: 84.38
|
||||||
GPQA_diamond_accuracy: 34.38
|
GPQA_diamond_accuracy: 31.25
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 87.5
|
winogrande_accuracy: 87.5
|
||||||
deepseek-v2-turbomind:
|
deepseek-v2-turbomind:
|
||||||
gsm8k_accuracy: 65.62
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 15.62
|
GPQA_diamond_accuracy: 3.12
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 81.25
|
||||||
|
10
.github/workflows/daily-run-test.yml
vendored
10
.github/workflows/daily-run-test.yml
vendored
@ -258,19 +258,19 @@ jobs:
|
|||||||
conda info --envs
|
conda info --envs
|
||||||
export from_tf=TRUE
|
export from_tf=TRUE
|
||||||
python tools/list_configs.py internlm2_5 mmlu
|
python tools/list_configs.py internlm2_5 mmlu
|
||||||
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
|
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
|
||||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
|
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
|
||||||
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
|
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||||
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
|
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
|
||||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
|
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
|
||||||
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
|
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||||
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
|
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
|
||||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
|
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
|
||||||
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
|
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a lmdeploy --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
|
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
|
||||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
|
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
|
||||||
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
|
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||||
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a vllm --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
|
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3_8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
|
||||||
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
|
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
|
||||||
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
|
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||||
- name: Run model test - api
|
- name: Run model test - api
|
||||||
|
Loading…
Reference in New Issue
Block a user