OpenCompass/.github/scripts/oc_score_baseline.yaml
zhulinJulia24 167cfdcca3
[ci] update daily testcase (#1285)
* Update daily-run-test.yml

* Create eval_regression_chat.py

* Delete .github/scripts/.github/scripts/eval_regression_chat.py

* Create eval_regression_chat.py

* Update pr-run-test.yml

* Update daily-run-test.yml

* Update daily-run-test.yml

* Update daily-run-test.yml

* Update oc_score_baseline.yaml

* Update oc_score_assert.py

* Update daily-run-test.yml

* Update daily-run-test.yml

* Update oc_score_baseline.yaml

* Update oc_score_assert.py

* Update oc_score_assert.py

* fix lint

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
2024-07-03 18:56:09 +08:00

181 lines
2.7 KiB
YAML

# Baseline scores, keyed by model name. Each model maps a dataset name
# (gsm8k, race-middle, race-high) to an integer baseline value.
# NOTE(review): presumably consumed by the score-assertion script in the same
# directory (oc_score_assert.py) -- confirm; units and tolerance are not
# defined in this file.

# Chat / instruction-tuned model variants (grouped by name).
baichuan2-7b-chat-hf:
  gsm8k: 30
  race-middle: 74
  race-high: 79

deepseek-7b-chat-hf:
  gsm8k: 60
  race-middle: 74
  race-high: 80

deepseek-moe-16b-chat-hf:
  gsm8k: 62
  race-middle: 62
  race-high: 70

gemma-2b-it-hf:
  gsm8k: 14
  race-middle: 62
  race-high: 52

gemma-7b-it-hf:
  gsm8k: 39
  race-middle: 74
  race-high: 71

internlm2-chat-1.8b-turbomind:
  gsm8k: 40
  race-middle: 82
  race-high: 83

internlm2-chat-1.8b-sft-turbomind:
  gsm8k: 32
  race-middle: 81
  race-high: 83

internlm2-chat-7b-turbomind:
  gsm8k: 69
  race-middle: 90
  race-high: 88

internlm2-chat-7b-sft-turbomind:
  gsm8k: 71
  race-middle: 91
  race-high: 92

llama-3-8b-instruct-hf:
  gsm8k: 77
  race-middle: 85
  race-high: 87

llama-3-8b-instruct-turbomind:
  gsm8k: 77
  race-middle: 85
  race-high: 89

mistral-7b-instruct-v0.2-hf:
  gsm8k: 48
  race-middle: 82
  race-high: 78

minicpm-2b-dpo-fp32-hf:
  gsm8k: 58
  race-middle: 66
  race-high: 74

minicpm-2b-sft-bf16-hf:
  gsm8k: 58
  race-middle: 75
  race-high: 81

minicpm-2b-sft-fp32-hf:
  gsm8k: 58
  race-middle: 75
  race-high: 81

phi-3-mini-4k-instruct-hf:
  gsm8k: 67
  race-middle: 81
  race-high: 84

qwen1.5-0.5b-chat-hf:
  gsm8k: 5
  race-middle: 55
  race-high: 50

qwen2-1.5b-instruct-turbomind:
  gsm8k: 60
  race-middle: 77
  race-high: 86

qwen2-7b-instruct-turbomind:
  gsm8k: 88
  race-middle: 87
  race-high: 89

yi-1.5-6b-chat-hf:
  gsm8k: 72
  race-middle: 88
  race-high: 86

yi-1.5-9b-chat-hf:
  gsm8k: 81
  race-middle: 89
  race-high: 91

# Base model variants (grouped by name).
deepseek-moe-16b-base-hf:
  gsm8k: 25
  race-middle: 35
  race-high: 23

deepseek-7b-base-turbomind:
  gsm8k: 21
  race-middle: 42
  race-high: 42

gemma-2b-hf:
  gsm8k: 19
  race-middle: 33
  race-high: 26

gemma-7b-hf:
  gsm8k: 65
  race-middle: 59
  race-high: 66

internlm2-1.8b-turbomind:
  gsm8k: 27
  race-middle: 75
  race-high: 72

internlm2-7b-turbomind:
  gsm8k: 67
  race-middle: 78
  race-high: 76

internlm2-base-7b-turbomind:
  gsm8k: 39
  race-middle: 75
  race-high: 81

llama-3-8b-turbomind:
  gsm8k: 52
  race-middle: 63
  race-high: 70

mistral-7b-v0.2-hf:
  gsm8k: 43
  race-middle: 42
  race-high: 60

qwen1.5-moe-a2.7b-hf:
  gsm8k: 64
  race-middle: 78
  race-high: 90

qwen2-0.5b-hf:
  gsm8k: 35
  race-middle: 52
  race-high: 48

qwen2-1.5b-turbomind:
  gsm8k: 57
  race-middle: 64
  race-high: 78

qwen2-7b-turbomind:
  gsm8k: 83
  race-middle: 88
  race-high: 88

yi-1.5-6b-hf:
  gsm8k: 59
  race-middle: 81
  race-high: 89

yi-1.5-9b-hf:
  gsm8k: 77
  race-middle: 90
  race-high: 90