mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
updaste
This commit is contained in:
parent
b87052718e
commit
0585f9dad2
@ -11,14 +11,10 @@ with read_base():
|
|||||||
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
|
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
|
||||||
winogrande_datasets # noqa: F401, E501
|
winogrande_datasets # noqa: F401, E501
|
||||||
# read hf models - chat models
|
# read hf models - chat models
|
||||||
from opencompass.configs.models.chatglm.hf_glm4_9b import \
|
|
||||||
models as hf_glm4_9b_model # noqa: F401, E501
|
|
||||||
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
|
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
|
||||||
models as lmdeploy_glm4_9b_model # noqa: F401, E501
|
models as lmdeploy_glm4_9b_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
|
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
|
||||||
models as hf_deepseek_7b_base_model # noqa: F401, E501
|
models as hf_deepseek_7b_base_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
|
|
||||||
models as hf_deepseek_v2_lite_model # noqa: F401, E501
|
|
||||||
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
|
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
|
||||||
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
|
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
|
||||||
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
|
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
|
||||||
@ -110,8 +106,6 @@ with read_base():
|
|||||||
|
|
||||||
from ...volc import infer as volc_infer # noqa: F401, E501
|
from ...volc import infer as volc_infer # noqa: F401, E501
|
||||||
|
|
||||||
hf_glm4_9b_model[0]['path'] = 'THUDM/glm-4-9b-hf'
|
|
||||||
|
|
||||||
race_datasets = [race_datasets[1]]
|
race_datasets = [race_datasets[1]]
|
||||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||||
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
||||||
|
5
.github/scripts/oc_score_assert.py
vendored
5
.github/scripts/oc_score_assert.py
vendored
@ -246,10 +246,7 @@ class TestCmdCase:
|
|||||||
@pytest.mark.parametrize('model, dataset',
|
@pytest.mark.parametrize('model, dataset',
|
||||||
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
|
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
|
||||||
('internlm2_5-7b-hf', 'race-high_accuracy'),
|
('internlm2_5-7b-hf', 'race-high_accuracy'),
|
||||||
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
|
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
|
||||||
('internlm2-1.8b-hf', 'race-middle_accuracy'),
|
|
||||||
('internlm2-1.8b-hf', 'race-high_accuracy'),
|
|
||||||
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
|
|
||||||
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
|
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
|
||||||
base_score = baseline_scores.get(model).get(dataset)
|
base_score = baseline_scores.get(model).get(dataset)
|
||||||
result_score = result_scores.get(model).get(dataset)
|
result_score = result_scores.get(model).get(dataset)
|
||||||
|
5
.github/scripts/oc_score_baseline.yaml
vendored
5
.github/scripts/oc_score_baseline.yaml
vendored
@ -8,11 +8,6 @@ internlm2_5-7b_hf:
|
|||||||
race-middle_accuracy: 91.78
|
race-middle_accuracy: 91.78
|
||||||
race-high_accuracy: 90.02
|
race-high_accuracy: 90.02
|
||||||
|
|
||||||
internlm2-1.8b-hf:
|
|
||||||
demo_gsm8k_accuracy: 15.62
|
|
||||||
race-middle_accuracy: 71.66
|
|
||||||
race-high_accuracy: 66.38
|
|
||||||
|
|
||||||
internlm2_5-7b-chat-lmdeploy:
|
internlm2_5-7b-chat-lmdeploy:
|
||||||
demo_gsm8k_accuracy: 89.06
|
demo_gsm8k_accuracy: 89.06
|
||||||
race-middle_accuracy: 92.76
|
race-middle_accuracy: 92.76
|
||||||
|
72
.github/scripts/oc_score_baseline_testrange.yaml
vendored
72
.github/scripts/oc_score_baseline_testrange.yaml
vendored
@ -210,13 +210,8 @@ chat:
|
|||||||
gsm8k_accuracy: 78.12
|
gsm8k_accuracy: 78.12
|
||||||
race-high_accuracy: 78.12
|
race-high_accuracy: 78.12
|
||||||
base:
|
base:
|
||||||
glm-4-9b-hf:
|
|
||||||
gsm8k_accuracy: 68.75
|
|
||||||
GPQA_diamond_accuracy: 31.25
|
|
||||||
race-high_accuracy: 93.75
|
|
||||||
winogrande_accuracy: 84.38
|
|
||||||
glm-4-9b-turbomind:
|
glm-4-9b-turbomind:
|
||||||
gsm8k_accuracy: 62.5
|
gsm8k_accuracy: 56.25
|
||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 28.12
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
@ -226,7 +221,7 @@ base:
|
|||||||
race-high_accuracy: 46.88
|
race-high_accuracy: 46.88
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
deepseek-7b-base-turbomind:
|
deepseek-7b-base-turbomind:
|
||||||
gsm8k_accuracy: 21.88
|
gsm8k_accuracy: 18.75
|
||||||
GPQA_diamond_accuracy: 0
|
GPQA_diamond_accuracy: 0
|
||||||
race-high_accuracy: 43.75
|
race-high_accuracy: 43.75
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
@ -255,16 +250,21 @@ base:
|
|||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
race-high_accuracy: 65.62
|
race-high_accuracy: 65.62
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
|
gemma-2-9b-turbomind:
|
||||||
|
gsm8k_accuracy: 68.75
|
||||||
|
GPQA_diamond_accuracy: 0
|
||||||
|
race-high_accuracy: 78.12
|
||||||
|
winogrande_accuracy: 50
|
||||||
gemma-2b-vllm:
|
gemma-2b-vllm:
|
||||||
gsm8k_accuracy: 15.62
|
gsm8k_accuracy: 15.62
|
||||||
GPQA_diamond_accuracy: 3.12
|
GPQA_diamond_accuracy: 3.12
|
||||||
race-high_accuracy:
|
race-high_accuracy: 28.12
|
||||||
winogrande_accuracy:
|
winogrande_accuracy: 68.75
|
||||||
gemma-7b-vllm:
|
gemma-7b-vllm:
|
||||||
gsm8k_accuracy: 43.75
|
gsm8k_accuracy: 43.75
|
||||||
GPQA_diamond_accuracy: 9.38
|
GPQA_diamond_accuracy: 6.25
|
||||||
race-high_accuracy:
|
race-high_accuracy: 81.25
|
||||||
winogrande_accuracy:
|
winogrande_accuracy: 81.25
|
||||||
internlm2_5-7b-hf:
|
internlm2_5-7b-hf:
|
||||||
gsm8k_accuracy: 37.5
|
gsm8k_accuracy: 37.5
|
||||||
GPQA_diamond_accuracy: 25
|
GPQA_diamond_accuracy: 25
|
||||||
@ -275,30 +275,25 @@ base:
|
|||||||
GPQA_diamond_accuracy: 18.75
|
GPQA_diamond_accuracy: 18.75
|
||||||
race-high_accuracy: 62.5
|
race-high_accuracy: 62.5
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 78.12
|
||||||
internlm2-base-7b-hf:
|
|
||||||
gsm8k_accuracy: 3.12
|
|
||||||
GPQA_diamond_accuracy: 21.88
|
|
||||||
race-high_accuracy: 75
|
|
||||||
winogrande_accuracy: 65.62
|
|
||||||
internlm2-1.8b-turbomind:
|
internlm2-1.8b-turbomind:
|
||||||
gsm8k_accuracy: 12.5
|
gsm8k_accuracy: 6.25
|
||||||
GPQA_diamond_accuracy: 9.38
|
GPQA_diamond_accuracy: 12.5
|
||||||
race-high_accuracy: 71.88
|
race-high_accuracy: 71.88
|
||||||
winogrande_accuracy: 78.12
|
winogrande_accuracy: 75
|
||||||
internlm2_5-7b-turbomind:
|
internlm2_5-7b-turbomind:
|
||||||
gsm8k_accuracy: 62.50
|
gsm8k_accuracy: 59.38
|
||||||
GPQA_diamond_accuracy: 34.38
|
GPQA_diamond_accuracy: 34.38
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 87.50
|
winogrande_accuracy: 84.38
|
||||||
internlm2-7b-turbomind:
|
internlm2-7b-turbomind:
|
||||||
gsm8k_accuracy: 53.12
|
gsm8k_accuracy: 50
|
||||||
GPQA_diamond_accuracy: 21.88
|
GPQA_diamond_accuracy: 18.75
|
||||||
race-high_accuracy: 71.88
|
race-high_accuracy: 71.88
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
internlm2-base-7b-turbomind:
|
internlm2-base-7b-turbomind:
|
||||||
gsm8k_accuracy: 37.50
|
gsm8k_accuracy: 37.50
|
||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 21.88
|
||||||
race-high_accuracy: 81.25
|
race-high_accuracy: 84.38
|
||||||
winogrande_accuracy: 75
|
winogrande_accuracy: 75
|
||||||
llama-2-7b-hf:
|
llama-2-7b-hf:
|
||||||
gsm8k_accuracy: 21.88
|
gsm8k_accuracy: 21.88
|
||||||
@ -395,21 +390,6 @@ base:
|
|||||||
GPQA_diamond_accuracy: 40.62
|
GPQA_diamond_accuracy: 40.62
|
||||||
race-high_accuracy: 87.5
|
race-high_accuracy: 87.5
|
||||||
winogrande_accuracy: 71.88
|
winogrande_accuracy: 71.88
|
||||||
deepseek-v2-lite-hf:
|
|
||||||
gsm8k_accuracy: 31.25
|
|
||||||
GPQA_diamond_accuracy: 28.12
|
|
||||||
race-high_accuracy: 59.38
|
|
||||||
winogrande_accuracy: 71.88
|
|
||||||
internlm2-20b-hf:
|
|
||||||
gsm8k_accuracy: 56.25
|
|
||||||
GPQA_diamond_accuracy: 15.62
|
|
||||||
race-high_accuracy: 68.75
|
|
||||||
winogrande_accuracy: 75
|
|
||||||
internlm2-base-20b-hf:
|
|
||||||
gsm8k_accuracy: 12.5
|
|
||||||
GPQA_diamond_accuracy: 9.38
|
|
||||||
race-high_accuracy: 84.38
|
|
||||||
winogrande_accuracy: 65.62
|
|
||||||
internlm2-20b-turbomind:
|
internlm2-20b-turbomind:
|
||||||
gsm8k_accuracy: 71.88
|
gsm8k_accuracy: 71.88
|
||||||
GPQA_diamond_accuracy: 15.62
|
GPQA_diamond_accuracy: 15.62
|
||||||
@ -430,11 +410,6 @@ base:
|
|||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 28.12
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 81.25
|
winogrande_accuracy: 81.25
|
||||||
deepseek-67b-base-hf:
|
|
||||||
gsm8k_accuracy: 59.38
|
|
||||||
GPQA_diamond_accuracy: 31.25
|
|
||||||
race-high_accuracy: 81.25
|
|
||||||
winogrande_accuracy: 90.62
|
|
||||||
deepseek-67b-base-turbomind:
|
deepseek-67b-base-turbomind:
|
||||||
gsm8k_accuracy: 56.25
|
gsm8k_accuracy: 56.25
|
||||||
GPQA_diamond_accuracy: 28.12
|
GPQA_diamond_accuracy: 28.12
|
||||||
@ -455,8 +430,3 @@ base:
|
|||||||
GPQA_diamond_accuracy: 15.62
|
GPQA_diamond_accuracy: 15.62
|
||||||
race-high_accuracy: 93.75
|
race-high_accuracy: 93.75
|
||||||
winogrande_accuracy: 84.38
|
winogrande_accuracy: 84.38
|
||||||
llama-3-70b-hf:
|
|
||||||
gsm8k_accuracy: 62.5
|
|
||||||
GPQA_diamond_accuracy: 3.12
|
|
||||||
race-high_accuracy: 93.75
|
|
||||||
winogrande_accuracy: 84.38
|
|
||||||
|
Loading…
Reference in New Issue
Block a user