mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

commit a997e6532f
Merge remote-tracking branch 'upstream/main' into openicl_eval_refactorize
.github/scripts/eval_regression_api.py (vendored, 6 lines changed)

@@ -24,9 +24,9 @@ models = [
         abbr='lmdeploy-api-test',
         type=OpenAISDK,
         key='EMPTY',
-        openai_api_base='http://0.0.0.0:23333/v1',
-        path='internlm2',
-        tokenizer_path='internlm/internlm2_5-7b-chat',
+        openai_api_base='http://localhost:23333/v1',
+        path='internlm3',
+        tokenizer_path='internlm/internlm3-8b-instruct',
         rpm_verbose=True,
         meta_template=api_meta_template,
         query_per_second=128,
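For orientation, here is a minimal sketch of how the fields in this hunk assemble into the OpenAI-compatible model entry after the change. The two-role api_meta_template below is an assumed stand-in for the one defined earlier in the script, and any fields outside the hunk are omitted.

from opencompass.models import OpenAISDK

# Assumed minimal meta template; the real one is defined earlier in the script.
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',  # placeholder; the local api_server does not check the key
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
    ),
]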
.github/scripts/eval_regression_base_models.py (vendored, 18 lines changed)

@@ -11,18 +11,10 @@ with read_base():
     from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
         winogrande_datasets  # noqa: F401, E501
     # read hf models - chat models
-    from opencompass.configs.models.chatglm.hf_glm4_9b import \
-        models as hf_glm4_9b_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
         models as lmdeploy_glm4_9b_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
         models as hf_deepseek_7b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
-        models as hf_deepseek_67b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
-        models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
-        models as hf_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
         models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \

@@ -49,12 +41,6 @@ with read_base():
         models as hf_internlm2_5_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
         models as hf_internlm2_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
-        models as hf_internlm2_20b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
-        models as hf_internlm2_base_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
-        models as hf_internlm2_base_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
         models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \

@@ -65,14 +51,14 @@ with read_base():
         models as lmdeploy_internlm2_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
         models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
+        models as lmdeploy_internlm2_base_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama2_7b import \
         models as hf_llama2_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
         models as hf_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b import \
         models as hf_llama3_8b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.hf_llama3_70b import \
-        models as hf_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
         models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
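These regression scripts select models purely by import: each config module exposes a models list, and the alias keeps every list in locals() so it can be collected later. A minimal sketch of the idiom under that assumption; the models aggregation line is inferred by analogy with the datasets line visible in the chat-models script below.

from mmengine.config import read_base

with read_base():
    # Each import binds a list of model configs to a unique *_model name.
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b_model  # noqa: F401, E501

# Flatten every imported *_model list into the final `models` list, so
# deleting an import above is all it takes to drop a model from the run.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])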
.github/scripts/eval_regression_chat_models.py (vendored, 41 lines changed)

@@ -15,14 +15,24 @@ with read_base():
         models as vllm_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
         models as hf_deepseek_7b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
-        models as hf_deepseek_67b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
-        models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
-        models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
+        models as lmdeploy_deepseek_67b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_llama_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_llama_70b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_qwen_1_5b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_qwen_32b_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
         models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
+        models as lmdeploy_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
         models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b_it import \

@@ -45,6 +55,8 @@ with read_base():
         models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
         models as hf_internlm2_5_20b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
+        models as hf_internlm3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
         models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \

@@ -57,6 +69,8 @@ with read_base():
         models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
         models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
+        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
         models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \

@@ -83,10 +97,6 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
-        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
-        models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
         models as \
         lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501

@@ -95,14 +105,19 @@ with read_base():
     from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
         models as \
         lmdeploy_mistral_small_instruct_2409_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
+        models as \
+        lmdeploy_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
         models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
         models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
+        models as vllm_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
         models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
-    from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
-        models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.phi.hf_phi_4 import \
+        models as hf_phi_4_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
         models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \

@@ -142,6 +157,8 @@ with read_base():

     from ...volc import infer as volc_infer  # noqa: F401, E501

+hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
+
 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
.github/scripts/oc_score_assert.py (vendored, 40 lines changed)

@@ -175,10 +175,11 @@ class TestApibench:
 class TestVolcFullbench:
     """Test cases for chat model."""

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
-         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
+        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
+        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
+    ] for p2 in dataset_list(p1, 'objective')])
     @pytest.mark.chat_objective
     def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):

@@ -245,10 +246,7 @@ class TestCmdCase:
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b-hf', 'race-middle_accuracy'),
                               ('internlm2_5-7b-hf', 'race-high_accuracy'),
-                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-middle_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-high_accuracy'),
-                              ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
+                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)

@@ -260,9 +258,9 @@ class TestCmdCase:
                              [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
                               ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
                               ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
-                              ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
+                              ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
+                              ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
+                              ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)

@@ -280,13 +278,25 @@ class TestCmdCase:

     @pytest.mark.case4
     @pytest.mark.parametrize(
-        'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(model, result_score, base_score, dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.case5
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
+    def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)


 def assert_score(model_type, score, baseline, dataset: str = ''):
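The tests above compare each result against a recorded baseline via assert_score. A hypothetical sketch of such a check, with an assumed tolerance of ±5% around the baseline; the real helper in oc_score_assert.py may use different thresholds per model or dataset.

def assert_score(model_type, score, baseline, dataset: str = ''):
    # Hypothetical tolerance check; the 5% band is an assumption.
    assert score is not None, f'{model_type}/{dataset}: missing result score'
    # sorted() keeps the bounds ordered even for negative baselines
    # (e.g. the wildbench_average values below).
    lo, hi = sorted((baseline * 0.95, baseline * 1.05))
    assert lo <= float(score) <= hi, (
        f'{model_type}/{dataset}: got {score}, expected within 5% of {baseline}')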
.github/scripts/oc_score_baseline.yaml (vendored, 29 lines changed)

@@ -8,20 +8,25 @@ internlm2_5-7b_hf:
   race-middle_accuracy: 91.78
   race-high_accuracy: 90.02

-internlm2-1.8b-hf:
-  demo_gsm8k_accuracy: 15.62
-  race-middle_accuracy: 71.66
-  race-high_accuracy: 66.38
-
 internlm2_5-7b-chat-lmdeploy:
-  demo_gsm8k_accuracy: 89.06
+  demo_gsm8k_accuracy: 87.50
   race-middle_accuracy: 92.76
   race-high_accuracy: 90.54

-internlm2-chat-1.8b-lmdeploy:
-  demo_gsm8k_accuracy: 31
-  race-middle_accuracy: 81.34
-  race-high_accuracy: 73.96
+internlm3-8b-instruct-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-vllm:
+  demo_gsm8k_accuracy: 81.25
+  race-middle_accuracy: 92.20
+  race-high_accuracy: 89.88

 internlm2_5-7b-chat_hf:
   demo_gsm8k_accuracy: 87.50

@@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf:
   race-high_accuracy: 90.48

 lmdeploy-api-test:
-  gsm8k_accuracy: 68.75
-  race-middle_accuracy: 87.50
+  gsm8k_accuracy: 56.25
+  race-middle_accuracy: 93.75
   race-high_accuracy: 93.75
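The baseline_scores fixture used by the test_cmd_case* tests presumably just loads this YAML into nested dicts, matching the baseline_scores.get(model).get(dataset) lookups above. A minimal sketch under that assumption; the actual fixture in the test suite's conftest may differ.

import pytest
import yaml


@pytest.fixture
def baseline_scores():
    # Nested mapping: model abbr -> metric name -> baseline value.
    with open('.github/scripts/oc_score_baseline.yaml') as f:
        return yaml.safe_load(f)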
.github/scripts/oc_score_baseline_fullbench.yaml (vendored, 663 lines changed)

@@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
     lcb_test_output_pass@1: 18.75
     bbh-logical_deduction_seven_objects_score: 50
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 72.6
-    cmmlu-china-specific_naive_average: 76.25
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 76.25
     mmlu_pro_math_accuracy: 25
     ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
@@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
     college_knowledge_naive_average: 87.5
   subjective:
     alignment_bench_v1_1_总分: 0.66
-    alpaca_eval_total: 20
+    alpaca_eval_total: 0
     arenahard_score: 50
     Followbench_naive_average: 1
     CompassArena_naive_average: 43
     mtbench101_avg: 7.8
-    wildbench_average: -12.78
+    wildbench_average: -15.56
     simpleqa_accuracy_given_attempted: 0
     chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 7.90
+    alignment_bench_v1_1_专业能力: 8.00
     alignment_bench_v1_1_数学计算: 0
     alignment_bench_v1_1_基本任务: 0
     alignment_bench_v1_1_逻辑推理: 0
@@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench:
     alignment_bench_v1_1_文本写作: 0
     alignment_bench_v1_1_角色扮演: 0
     alignment_bench_v1_1_综合问答: 0
-    alpaca_eval_helpful_base: 20
+    alpaca_eval_helpful_base: 0
     compassarena_language_naive_average: 35
     compassarena_knowledge_naive_average: 55
     compassarena_reason_v2_naive_average: 40
@@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench:
 internlm2_5-7b-chat-turbomind_fullbench:
   objective:
     race-high_accuracy: 93.75
-    ARC-c_accuracy: 93.75
+    ARC-c_accuracy: 87.50
     BoolQ_accuracy: 68.75
     triviaqa_wiki_1shot_score: 50
     nq_open_1shot_score: 25
     IFEval_Prompt-level-strict-accuracy: 56.25
-    drop_accuracy: 81.25
+    drop_accuracy: 75
     GPQA_diamond_accuracy: 31.25
-    hellaswag_accuracy: 81.25
-    TheoremQA_score: 6.25
+    hellaswag_accuracy: 87.5
+    TheoremQA_score: 12.5
     musr_average_naive_average: 39.58
-    korbench_single_naive_average: 37.50
-    gsm8k_accuracy: 68.75
-    math_accuracy: 68.75
+    korbench_single_naive_average: 40
+    gsm8k_accuracy: 62.5
+    math_accuracy: 75
     cmo_fib_accuracy: 6.25
     aime2024_accuracy: 6.25
-    wikibench-wiki-single_choice_cncircular_perf_4: 50.00
+    wikibench-wiki-single_choice_cncircular_perf_4: 25
     sanitized_mbpp_score: 68.75
-    ds1000_naive_average: 16.96
+    ds1000_naive_average: 17.86
     lcb_code_generation_pass@1: 12.5
     lcb_code_execution_pass@1: 43.75
-    lcb_test_output_pass@1: 25.00
-    bbh-logical_deduction_seven_objects_score: 50.00
-    bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 69.71
-    cmmlu-china-specific_naive_average: 75.83
+    lcb_test_output_pass@1: 18.75
+    bbh-logical_deduction_seven_objects_score: 56.25
+    bbh-multistep_arithmetic_two_score: 75
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 78.33
     mmlu_pro_math_accuracy: 31.25
-    ds1000_Pandas_accuracy: 0
+    ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
     ds1000_Tensorflow_accuracy: 12.5
-    ds1000_Scipy_accuracy: 18.75
+    ds1000_Scipy_accuracy: 25
     ds1000_Sklearn_accuracy: 18.75
-    ds1000_Pytorch_accuracy: 18.75
+    ds1000_Pytorch_accuracy: 6.25
     ds1000_Matplotlib_accuracy: 50.00
     openai_mmmlu_lite_AR-XY_accuracy: 37.5
     college_naive_average: 12.50
     college_knowledge_naive_average: 87.5
   subjective:
-    alignment_bench_v1_1_总分: 0.70
+    alignment_bench_v1_1_总分: 0.66
     alpaca_eval_total: 0
     arenahard_score: 50
     Followbench_naive_average: 1
-    CompassArena_naive_average: 38
-    mtbench101_avg: 7.80
-    wildbench_average: -4.86
+    CompassArena_naive_average: 40
+    mtbench101_avg: 8
+    wildbench_average: -6.81
     simpleqa_accuracy_given_attempted: 0
     chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 8.4
+    alignment_bench_v1_1_专业能力: 7.9
     alignment_bench_v1_1_数学计算: 0
     alignment_bench_v1_1_基本任务: 0
     alignment_bench_v1_1_逻辑推理: 0
@@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
     alignment_bench_v1_1_综合问答: 0
     alpaca_eval_helpful_base: 0
     compassarena_language_naive_average: 35
-    compassarena_knowledge_naive_average: 50
-    compassarena_reason_v2_naive_average: 30
-    compassarena_math_v2_naive_average: 50
-    compassarena_creationv2_zh_naive_average: 25
+    compassarena_knowledge_naive_average: 45
+    compassarena_reason_v2_naive_average: 25
+    compassarena_math_v2_naive_average: 60
+    compassarena_creationv2_zh_naive_average: 35
     followbench_llmeval_en_HSR_AVG: 1
     followbench_llmeval_en_SSR_AVG: 1
     followbench_llmeval_en_HSR_L1: 1
@@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench:
     drop_accuracy: 62.5
     GPQA_diamond_accuracy: 62.5
     hellaswag_accuracy: 93.75
-    TheoremQA_score: 25.00
+    TheoremQA_score: 31.25
     winogrande_accuracy: 87.5
-    gsm8k_accuracy: 62.50
-    GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
+    gsm8k_accuracy: 56.25
+    GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
     GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
     math_accuracy: 18.75
     wikibench-wiki-single_choice_cncircular_perf_4: 25
     sanitized_mbpp_score: 62.50
-    dingo_en_192_score: 31.25
+    dingo_en_192_score: 50.00
     dingo_zh_170_score: 93.75
     mmlu-other_accuracy: 76.92
     cmmlu-china-specific_accuracy: 84.17
     mmlu_pro_math_accuracy: 18.75
-    bbh-logical_deduction_seven_objects_score: 50
+    bbh-logical_deduction_seven_objects_score: 43.75
     bbh-multistep_arithmetic_two_score: 56.25
     college_naive_average: 12.5
     college_knowledge_naive_average: 87.5
@@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
     sanitized_mbpp_score: 55.25
     dingo_en_192_score: 60.94
     dingo_zh_170_score: 67.65
-    mmlu-stem_naive_average: 63.72
-    mmlu-social-science_naive_average: 80.15
-    mmlu-humanities_naive_average: 74.27
-    mmlu-other_naive_average: 71.85
-    cmmlu-stem_naive_average: 67.07
-    cmmlu-social-science_naive_average: 81.49
-    cmmlu-humanities_naive_average: 85.84
-    cmmlu-other_naive_average: 82.69
-    cmmlu-china-specific_naive_average: 79.88
+    mmlu-stem_accuracy: 63.72
+    mmlu-social-science_accuracy: 80.15
+    mmlu-humanities_accuracy: 74.27
+    mmlu-other_accuracy: 71.85
+    cmmlu-stem_accuracy: 67.07
+    cmmlu-social-science_accuracy: 81.49
+    cmmlu-humanities_accuracy: 85.84
+    cmmlu-other_accuracy: 82.69
+    cmmlu-china-specific_accuracy: 79.88
     mmlu_pro_biology_accuracy: 58.58
     mmlu_pro_business_accuracy: 28.01
     mmlu_pro_chemistry_accuracy: 22.79
@@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
     longbench_naive_average: 46.19
     longbench_zh_naive_average: 49.3
     longbench_en_naive_average: 43.97
-    longbench_single-document-qa_naive_average: 42.84
-    longbench_multi-document-qa_naive_average: 37.29
-    longbench_summarization_naive_average: 23.21
-    longbench_few-shot-learning_naive_average: 61.67
-    longbench_synthetic-tasks_naive_average: 60.05
-    longbench_code-completion_naive_average: 52.09
+    longbench_single-document-qa_score: 42.84
+    longbench_multi-document-qa_score: 41.25
+    longbench_summarization_score: 23.21
+    longbench_few-shot-learning_score: 61.67
+    longbench_synthetic-tasks_score: 60.05
+    longbench_code-completion_score: 52.09

 internlm2_5-7b-chat-turbomind:
   objective:
@@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
     teval_naive_average: 80
     SciCode_sub_accuracy: 5.56
     qa_dingo_cn_score: 99.01
-    mmlu-stem_naive_average: 68.2
-    mmlu-social-science_naive_average: 75.8
-    mmlu-humanities_naive_average: 69.3
-    mmlu-other_naive_average: 71.3
-    cmmlu-stem_naive_average: 66.64
-    cmmlu-social-science_naive_average: 76
-    cmmlu-humanities_naive_average: 77.9
-    cmmlu-other_naive_average: 77.25
-    cmmlu-china-specific_naive_average: 73.6
+    mmlu-stem_accuracy: 68.2
+    mmlu-social-science_accuracy: 75.8
+    mmlu-humanities_accuracy: 69.3
+    mmlu-other_accuracy: 71.3
+    cmmlu-stem_accuracy: 66.64
+    cmmlu-social-science_accuracy: 76
+    cmmlu-humanities_accuracy: 77.9
+    cmmlu-other_accuracy: 77.25
+    cmmlu-china-specific_accuracy: 73.6
     mmlu_pro_biology_accuracy: 66.67
     mmlu_pro_business_accuracy: 47.91
     mmlu_pro_chemistry_accuracy: 35
@@ -409,7 +409,7 @@ internlm2_5-7b-chat-turbomind:
     alpaca_eval_koala: 28.21
     alpaca_eval_oasst: 23.4
     alpaca_eval_selfinstruct: 30.95
-    alpaca_eval_vicuna: 25
+    alpaca_eval_vicuna: 33.75
     compassarena_language_naive_average: 52.5
     compassarena_knowledge_naive_average: 36
     compassarena_reason_v2_naive_average: 35
@@ -448,9 +448,536 @@ internlm2_5-7b-chat-1m-turbomind:
     babilong_32k_naive_average: 48.9
     babilong_128k_naive_average: 40.8
     babilong_256k_naive_average: 23.5
-    longbench_single-document-qa_naive_average: 43.56
-    longbench_multi-document-qa_naive_average: 46.24
-    longbench_summarization_naive_average: 24.32
-    longbench_few-shot-learning_naive_average: 51.67
-    longbench_synthetic-tasks_naive_average: 66.83
-    longbench_code-completion_naive_average: 45.99
+    longbench_single-document-qa_score: 43.56
+    longbench_multi-document-qa_score: 46.24
+    longbench_summarization_score: 24.32
+    longbench_few-shot-learning_score: 51.67
+    longbench_synthetic-tasks_score: 66.83
+    longbench_code-completion_score: 45.99
+
+qwen2.5-7b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 84.99
+    ARC-c_accuracy: 92.2
+    BoolQ_accuracy: 86.7
+    triviaqa_wiki_1shot_score: 53.06
+    nq_open_1shot_score: 17.51
+    mmmlu_lite_naive_average: 54.96
+    IFEval_Prompt-level-strict-accuracy: 71.53
+    drop_accuracy: 80.07
+    bbh_naive_average: 68.81
+    GPQA_diamond_accuracy: 34.34
+    hellaswag_accuracy: 85.42
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.44
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 92.57
+    GaokaoBench_weighted_average: 80.14
+    math_accuracy: 73.58
+    cmo_fib_accuracy: 25
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 77.33
+    wikibench-wiki-single_choice_cncircular_perf_4: 34.9
+    cmmlu_naive_average: 75.97
+    mmlu_naive_average: 76.01
+    mmlu_pro_naive_average: 56.12
+    openai_humaneval_humaneval_pass@1: 83.54
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.29
+    ds1000_naive_average: 18.66
+    lcb_code_generation_pass@1: 39.5
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.68
+    bigcodebench_hard_instruct_pass@1: 16.22
+    bigcodebench_hard_complete_pass@1: 11.49
+    teval_naive_average: 79.72
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 99.01
+    mmlu_accuracy: 76.01
+    mmlu-stem_accuracy: 77.59
+    mmlu-social-science_accuracy: 79.02
+    mmlu-humanities_accuracy: 72.07
+    mmlu-other_accuracy: 74.86
+    cmmlu_accuracy: 75.97
+    cmmlu-stem_accuracy: 73.09
+    cmmlu-social-science_accuracy: 75.95
+    cmmlu-humanities_accuracy: 76.53
+    cmmlu-other_accuracy: 78.79
+    cmmlu-china-specific_accuracy: 73.17
+    mmlu_pro_accuracy: 56.12
+    mmlu_pro_biology_accuracy: 71.41
+    mmlu_pro_business_accuracy: 67.68
+    mmlu_pro_chemistry_accuracy: 54.59
+    mmlu_pro_computer_science_accuracy: 58.29
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 42.41
+    mmlu_pro_health_accuracy: 55.87
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 28.97
+    mmlu_pro_math_accuracy: 73.13
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 58.43
+    mmlu_pro_psychology_accuracy: 63.16
+    mmlu_pro_other_accuracy: 53.57
+    humanevalx-python_pass@1: 50
+    humanevalx-cpp_pass@1: 42.07
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 75
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.18
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 10.43
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 60.65
+    mmmlu_lite_accuracy: 54.96
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.25
+    openai_mmmlu_lite_DE-DE_accuracy: 59.93
+    openai_mmmlu_lite_ES-LA_accuracy: 66.53
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 49.26
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.47
+    openai_mmmlu_lite_JA-JP_accuracy: 61.54
+    openai_mmmlu_lite_KO-KR_accuracy: 60.28
+    openai_mmmlu_lite_PT-BR_accuracy: 55.51
+    openai_mmmlu_lite_SW-KE_accuracy: 36.42
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.61
+    college_naive_average: 48
+    high_naive_average: 59
+    middle_naive_average: 78
+    primary_naive_average: 85.67
+    arithmetic_naive_average: 75.67
+    mathbench-a (average)_naive_average: 69.27
+    college_knowledge_naive_average: 83.86
+    high_knowledge_naive_average: 80.29
+    middle_knowledge_naive_average: 84.26
+    primary_knowledge_naive_average: 93.16
+    mathbench-t (average)_naive_average: 85.39
+
+internlm2_5-7b-chat-pytorch:
+  objective:
+    race-high_accuracy: 86.39
+    ARC-c_accuracy: 90.51
+    BoolQ_accuracy: 88.01
+    triviaqa_wiki_1shot_score: 64.77
+    nq_open_1shot_score: 22.71
+    mmmlu_lite_naive_average: 45.02
+    IFEval_Prompt-level-strict-accuracy: 56.56
+    drop_accuracy: 75.46
+    bbh_naive_average: 73.34
+    GPQA_diamond_accuracy: 32.83
+    hellaswag_accuracy: 94.81
+    TheoremQA_score: 23.88
+    musr_average_naive_average: 51.31
+    korbench_single_naive_average: 32
+    ARC_Prize_Public_Evaluation_accuracy: 0.01
+    gsm8k_accuracy: 86.96
+    GaokaoBench_weighted_average: 78.05
+    math_accuracy: 60.34
+    cmo_fib_accuracy: 12.98
+    aime2024_accuracy: 3.33
+    Mathbench_naive_average: 64.82
+    wikibench-wiki-single_choice_cncircular_perf_4: 31.7
+    cmmlu_naive_average: 74.24
+    mmlu_naive_average: 70.2
+    mmlu_pro_naive_average: 45.39
+    openai_humaneval_humaneval_pass@1: 70.12
+    sanitized_mbpp_score: 64.59
+    humanevalx_naive_average: 38.78
+    ds1000_naive_average: 14.19
+    lcb_code_generation_pass@1: 16.5
+    lcb_code_execution_pass@1: 33.82
+    lcb_test_output_pass@1: 22.62
+    bigcodebench_hard_instruct_pass@1: 6.08
+    bigcodebench_hard_complete_pass@1: 6.76
+    teval_naive_average: 79.73
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 70.2
+    mmlu-stem_accuracy: 67.73
+    mmlu-social-science_accuracy: 75.49
+    mmlu-humanities_accuracy: 68.56
+    mmlu-other_accuracy: 70.58
+    cmmlu_accuracy: 74.24
+    cmmlu-stem_accuracy: 66.7
+    cmmlu-social-science_accuracy: 75.88
+    cmmlu-humanities_accuracy: 77.56
+    cmmlu-other_accuracy: 77.52
+    cmmlu-china-specific_accuracy: 73.46
+    mmlu_pro_accuracy: 45.39
+    mmlu_pro_biology_accuracy: 65.83
+    mmlu_pro_business_accuracy: 51.96
+    mmlu_pro_chemistry_accuracy: 36.84
+    mmlu_pro_computer_science_accuracy: 48.29
+    mmlu_pro_economics_accuracy: 56.16
+    mmlu_pro_engineering_accuracy: 29.1
+    mmlu_pro_health_accuracy: 44.5
+    mmlu_pro_history_accuracy: 42.26
+    mmlu_pro_law_accuracy: 24.98
+    mmlu_pro_math_accuracy: 54.85
+    mmlu_pro_philosophy_accuracy: 39.28
+    mmlu_pro_physics_accuracy: 37.41
+    mmlu_pro_psychology_accuracy: 58.27
+    mmlu_pro_other_accuracy: 45.78
+    humanevalx-python_pass@1: 56.1
+    humanevalx-cpp_pass@1: 20.73
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 59.15
+    humanevalx-js_pass@1: 57.93
+    ds1000_Pandas_accuracy: 8.93
+    ds1000_Numpy_accuracy: 4.09
+    ds1000_Tensorflow_accuracy: 11.11
+    ds1000_Scipy_accuracy: 7.55
+    ds1000_Sklearn_accuracy: 7.83
+    ds1000_Pytorch_accuracy: 8.82
+    ds1000_Matplotlib_accuracy: 50.97
+    mmmlu_lite_accuracy: 45.02
+    openai_mmmlu_lite_AR-XY_accuracy: 18.6
+    openai_mmmlu_lite_BN-BD_accuracy: 27.58
+    openai_mmmlu_lite_DE-DE_accuracy: 51.23
+    openai_mmmlu_lite_ES-LA_accuracy: 56.63
+    openai_mmmlu_lite_FR-FR_accuracy: 58.11
+    openai_mmmlu_lite_HI-IN_accuracy: 33.82
+    openai_mmmlu_lite_ID-ID_accuracy: 50.39
+    openai_mmmlu_lite_IT-IT_accuracy: 50.39
+    openai_mmmlu_lite_JA-JP_accuracy: 50.95
+    openai_mmmlu_lite_KO-KR_accuracy: 45.05
+    openai_mmmlu_lite_PT-BR_accuracy: 57.89
+    openai_mmmlu_lite_SW-KE_accuracy: 32.14
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 65.33
+    college_naive_average: 21
+    high_naive_average: 47
+    middle_naive_average: 59.67
+    primary_naive_average: 76
+    arithmetic_naive_average: 62
+    mathbench-a (average)_naive_average: 53.13
+    college_knowledge_naive_average: 68.99
+    high_knowledge_naive_average: 70.06
+    middle_knowledge_naive_average: 78.53
+    primary_knowledge_naive_average: 88.49
+    mathbench-t (average)_naive_average: 76.51
+
+qwen2.5-7b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 85.16
+    ARC-c_accuracy: 90.85
+    BoolQ_accuracy: 86.61
+    triviaqa_wiki_1shot_score: 52.96
+    nq_open_1shot_score: 17.62
+    mmmlu_lite_naive_average: 54.7
+    IFEval_Prompt-level-strict-accuracy: 71.35
+    drop_accuracy: 80.23
+    bbh_naive_average: 68.88
+    GPQA_diamond_accuracy: 36.36
+    hellaswag_accuracy: 85.49
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.3
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 91.66
+    GaokaoBench_weighted_average: 80.02
+    math_accuracy: 73.74
+    cmo_fib_accuracy: 26.44
+    aime2024_accuracy: 10
+    Mathbench_naive_average: 77.08
+    wikibench-wiki-single_choice_cncircular_perf_4: 34
+    cmmlu_naive_average: 75.9
+    mmlu_naive_average: 76.27
+    mmlu_pro_naive_average: 56.14
+    openai_humaneval_humaneval_pass@1: 84.76
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.17
+    ds1000_naive_average: 18.57
+    lcb_code_generation_pass@1: 38.75
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.45
+    bigcodebench_hard_instruct_pass@1: 16.89
+    bigcodebench_hard_complete_pass@1: 12.16
+    teval_naive_average: 79.46
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.27
+    mmlu-stem_accuracy: 77.75
+    mmlu-social-science_accuracy: 78.65
+    mmlu-humanities_accuracy: 73.12
+    mmlu-other_accuracy: 75.05
+    cmmlu_accuracy: 75.9
+    cmmlu-stem_accuracy: 73.41
+    cmmlu-social-science_accuracy: 75.97
+    cmmlu-humanities_accuracy: 76.42
+    cmmlu-other_accuracy: 78.15
+    cmmlu-china-specific_accuracy: 73.27
+    mmlu_pro_accuracy: 56.14
+    mmlu_pro_biology_accuracy: 72.25
+    mmlu_pro_business_accuracy: 66.16
+    mmlu_pro_chemistry_accuracy: 55.65
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 41.38
+    mmlu_pro_health_accuracy: 54.89
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 29.06
+    mmlu_pro_math_accuracy: 73.58
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 60.05
+    mmlu_pro_psychology_accuracy: 61.9
+    mmlu_pro_other_accuracy: 52.6
+    humanevalx-python_pass@1: 51.83
+    humanevalx-cpp_pass@1: 42.68
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 73.78
+    humanevalx-js_pass@1: 72.56
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.64
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 8.7
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 61.29
+    mmmlu_lite_accuracy: 54.7
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.18
+    openai_mmmlu_lite_DE-DE_accuracy: 60
+    openai_mmmlu_lite_ES-LA_accuracy: 66.18
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 48.63
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.26
+    openai_mmmlu_lite_JA-JP_accuracy: 60.7
+    openai_mmmlu_lite_KO-KR_accuracy: 60.63
+    openai_mmmlu_lite_PT-BR_accuracy: 54.46
+    openai_mmmlu_lite_SW-KE_accuracy: 36
+    openai_mmmlu_lite_YO-NG_accuracy: 31.86
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.4
+    college_naive_average: 48.33
+    high_naive_average: 59.33
+    middle_naive_average: 76.67
+    primary_naive_average: 86.67
+    arithmetic_naive_average: 74.33
+    mathbench-a (average)_naive_average: 69.07
+    college_knowledge_naive_average: 83.54
+    high_knowledge_naive_average: 80.82
+    middle_knowledge_naive_average: 83.79
+    primary_knowledge_naive_average: 92.22
+    mathbench-t (average)_naive_average: 85.1
+
+internlm3-8b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 89.22
+    ARC-c_accuracy: 92.54
+    BoolQ_accuracy: 86.45
+    triviaqa_wiki_1shot_score: 60.72
+    nq_open_1shot_score: 20.25
+    mmmlu_lite_naive_average: 41.82
+    IFEval_Prompt-level-strict-accuracy: 77.45
+    drop_accuracy: 83.27
+    bbh_naive_average: 55.22
+    GPQA_diamond_accuracy: 37.88
+    hellaswag_accuracy: 91.28
+    TheoremQA_score: 20.12
+    musr_average_naive_average: 36.86
+    korbench_single_naive_average: 41.2
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 91.28
+    GaokaoBench_weighted_average: 86.59
+    math_accuracy: 76.96
+    cmo_fib_accuracy: 35.1
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 78.96
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.45
+    cmmlu_naive_average: 83.33
+    mmlu_naive_average: 76.21
+    mmlu_pro_naive_average: 57.96
+    openai_humaneval_humaneval_pass@1: 81.71
+    sanitized_mbpp_score: 69.65
+    humanevalx_naive_average: 40.73
+    ds1000_naive_average: 27.23
+    lcb_code_generation_pass@1: 34.75
+    lcb_code_execution_pass@1: 49.9
+    lcb_test_output_pass@1: 48.19
+    bigcodebench_hard_instruct_pass@1: 13.51
+    bigcodebench_hard_complete_pass@1: 15.54
+    teval_naive_average: 82.86
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.21
+    mmlu-stem_accuracy: 77.7
+    mmlu-social-science_accuracy: 80.98
+    mmlu-humanities_accuracy: 70.83
+    mmlu-other_accuracy: 75.01
+    cmmlu_accuracy: 83.33
+    cmmlu-stem_accuracy: 79.66
+    cmmlu-social-science_accuracy: 83.39
+    cmmlu-humanities_accuracy: 84.73
+    cmmlu-other_accuracy: 86.2
+    cmmlu-china-specific_accuracy: 81.77
+    mmlu_pro_accuracy: 57.96
+    mmlu_pro_biology_accuracy: 75.45
+    mmlu_pro_business_accuracy: 64.64
+    mmlu_pro_chemistry_accuracy: 59.81
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 68.6
+    mmlu_pro_engineering_accuracy: 44.79
+    mmlu_pro_health_accuracy: 58.31
+    mmlu_pro_history_accuracy: 49.87
+    mmlu_pro_law_accuracy: 32.43
+    mmlu_pro_math_accuracy: 70.17
+    mmlu_pro_philosophy_accuracy: 46.89
+    mmlu_pro_physics_accuracy: 59.58
+    mmlu_pro_psychology_accuracy: 66.29
+    mmlu_pro_other_accuracy: 54.33
+    humanevalx-python_pass@1: 43.9
+    humanevalx-cpp_pass@1: 20.12
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 65.24
+    ds1000_Pandas_accuracy: 16.49
+    ds1000_Numpy_accuracy: 34.09
+    ds1000_Tensorflow_accuracy: 26.67
+    ds1000_Scipy_accuracy: 17.92
+    ds1000_Sklearn_accuracy: 20.87
+    ds1000_Pytorch_accuracy: 19.12
+    ds1000_Matplotlib_accuracy: 55.48
+    mmmlu_lite_accuracy: 41.82
+    openai_mmmlu_lite_AR-XY_accuracy: 32.56
+    openai_mmmlu_lite_BN-BD_accuracy: 4.56
+    openai_mmmlu_lite_DE-DE_accuracy: 24.91
+    openai_mmmlu_lite_ES-LA_accuracy: 51.09
+    openai_mmmlu_lite_FR-FR_accuracy: 61.68
+    openai_mmmlu_lite_HI-IN_accuracy: 24.98
+    openai_mmmlu_lite_ID-ID_accuracy: 44.56
+    openai_mmmlu_lite_IT-IT_accuracy: 52.35
+    openai_mmmlu_lite_JA-JP_accuracy: 51.02
+    openai_mmmlu_lite_KO-KR_accuracy: 47.93
+    openai_mmmlu_lite_PT-BR_accuracy: 53.89
+    openai_mmmlu_lite_SW-KE_accuracy: 33.47
+    openai_mmmlu_lite_YO-NG_accuracy: 33.47
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.05
+    college_naive_average: 45.67
+    high_naive_average: 64.67
+    middle_naive_average: 82.33
+    primary_naive_average: 90.33
+    arithmetic_naive_average: 74
+    mathbench-a (average)_naive_average: 71.4
+    college_knowledge_naive_average: 85.28
+    high_knowledge_naive_average: 79.43
+    middle_knowledge_naive_average: 87.9
+    primary_knowledge_naive_average: 93.42
+    mathbench-t (average)_naive_average: 86.51
+
+internlm3-8b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 89.02
+    ARC-c_accuracy: 93.56
+    BoolQ_accuracy: 86.67
+    triviaqa_wiki_1shot_score: 60.54
+    nq_open_1shot_score: 20.3
+    mmmlu_lite_naive_average: 42.6
+    IFEval_Prompt-level-strict-accuracy: 79.11
+    drop_accuracy: 83.32
+    bbh_naive_average: 54.76
+    GPQA_diamond_accuracy: 42.42
+    hellaswag_accuracy: 91.31
+    TheoremQA_score: 18
+    musr_average_naive_average: 36.62
+    korbench_single_naive_average: 41.84
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 90.67
+    GaokaoBench_weighted_average: 86.27
+    math_accuracy: 76.68
+    cmo_fib_accuracy: 33.65
+    aime2024_accuracy: 10
+    Mathbench_naive_average: 78.92
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.35
+    cmmlu_naive_average: 83.11
+    mmlu_naive_average: 76.23
+    mmlu_pro_naive_average: 58.16
+    openai_humaneval_humaneval_pass@1: 82.32
+    sanitized_mbpp_score: 70.04
+    humanevalx_naive_average: 39.76
+    ds1000_naive_average: 27.84
+    lcb_code_generation_pass@1: 34.5
+    lcb_code_execution_pass@1: 48.02
+    lcb_test_output_pass@1: 47.74
+    bigcodebench_hard_instruct_pass@1: 12.84
+    bigcodebench_hard_complete_pass@1: 15.54
+    teval_naive_average: 82.86
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.23
+    mmlu-stem_accuracy: 78.08
+    mmlu-social-science_accuracy: 80.31
+    mmlu-humanities_accuracy: 71.38
+    mmlu-other_accuracy: 74.63
+    cmmlu_accuracy: 83.11
+    cmmlu-stem_accuracy: 79.42
+    cmmlu-social-science_accuracy: 83.34
+    cmmlu-humanities_accuracy: 83.95
+    cmmlu-other_accuracy: 86.22
+    cmmlu-china-specific_accuracy: 81.5
+    mmlu_pro_accuracy: 58.16
+    mmlu_pro_biology_accuracy: 74.62
+    mmlu_pro_business_accuracy: 65.02
+    mmlu_pro_chemistry_accuracy: 60.69
+    mmlu_pro_computer_science_accuracy: 61.46
+    mmlu_pro_economics_accuracy: 68.25
+    mmlu_pro_engineering_accuracy: 45.3
+    mmlu_pro_health_accuracy: 60.15
+    mmlu_pro_history_accuracy: 50.66
+    mmlu_pro_law_accuracy: 31.7
+    mmlu_pro_math_accuracy: 70.32
+    mmlu_pro_philosophy_accuracy: 47.7
+    mmlu_pro_physics_accuracy: 59.51
+    mmlu_pro_psychology_accuracy: 65.41
+    mmlu_pro_other_accuracy: 53.46
+    humanevalx-python_pass@1: 42.68
+    humanevalx-cpp_pass@1: 19.51
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 72.56
+    humanevalx-js_pass@1: 64.02
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 35
+    ds1000_Tensorflow_accuracy: 24.44
+    ds1000_Scipy_accuracy: 20.75
+    ds1000_Sklearn_accuracy: 21.74
+    ds1000_Pytorch_accuracy: 22.06
+    ds1000_Matplotlib_accuracy: 56.77
+    mmmlu_lite_accuracy: 42.6
+    openai_mmmlu_lite_AR-XY_accuracy: 32.84
+    openai_mmmlu_lite_BN-BD_accuracy: 10.46
+    openai_mmmlu_lite_DE-DE_accuracy: 24.56
+    openai_mmmlu_lite_ES-LA_accuracy: 50.95
+    openai_mmmlu_lite_FR-FR_accuracy: 61.05
+    openai_mmmlu_lite_HI-IN_accuracy: 30.6
+    openai_mmmlu_lite_ID-ID_accuracy: 45.89
+    openai_mmmlu_lite_IT-IT_accuracy: 51.79
+    openai_mmmlu_lite_JA-JP_accuracy: 51.65
+    openai_mmmlu_lite_KO-KR_accuracy: 48.77
+    openai_mmmlu_lite_PT-BR_accuracy: 52.7
+    openai_mmmlu_lite_SW-KE_accuracy: 32.91
+    openai_mmmlu_lite_YO-NG_accuracy: 32.84
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.33
+    college_naive_average: 47
+    high_naive_average: 66.67
+    middle_naive_average: 81.67
+    primary_naive_average: 89.33
+    arithmetic_naive_average: 73.67
+    mathbench-a (average)_naive_average: 71.67
+    college_knowledge_naive_average: 82.91
+    high_knowledge_naive_average: 79.86
+    middle_knowledge_naive_average: 88.92
+    primary_knowledge_naive_average: 92.96
+    mathbench-t (average)_naive_average: 86.16
236 .github/scripts/oc_score_baseline_testrange.yaml vendored
@@ -1,21 +1,24 @@
 chat:
   glm-4-9b-chat-hf:
-    gsm8k_accuracy: 68.75
+    gsm8k_accuracy: 56.25
-    race-high_accuracy: 90.62
+    race-high_accuracy: 84.38
   glm-4-9b-chat-turbomind:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 90.62
   glm-4-9b-chat-vllm:
-    gsm8k_accuracy: 71.88
+    gsm8k_accuracy: 68.75
     race-high_accuracy: 90.62
   deepseek-7b-chat-hf:
     gsm8k_accuracy: 46.88
     race-high_accuracy: 81.25
-  deepseek-moe-16b-chat-hf:
+  deepseek-r1-distill-llama-8b-turbomind:
-    gsm8k_accuracy: 50
+    gsm8k_accuracy: 31.25
-    race-high_accuracy: 68.75
+    race-high_accuracy: 81.25
+  deepseek-r1-distill-qwen-1_5b-turbomind:
+    gsm8k_accuracy: 37.5
+    race-high_accuracy: 53.12
   deepseek-7b-chat-vllm:
-    gsm8k_accuracy: 50
+    gsm8k_accuracy: 43.75
     race-high_accuracy: 78.12
   gemma2-2b-it-hf:
     gsm8k_accuracy: 50
@@ -36,34 +39,40 @@ chat:
     gsm8k_accuracy: 78.12
     race-high_accuracy: 93.75
   gemma-7b-it-vllm:
-    gsm8k_accuracy: 46.88
+    gsm8k_accuracy: 31.25
     race-high_accuracy: 68.75
   internlm2_5-7b-chat-hf:
     gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
+  internlm3-8b-instruct-hf:
+    gsm8k_accuracy: 65.62
+    race-high_accuracy: 87.5
   internlm2_5-7b-chat-turbomind:
-    gsm8k_accuracy: 87.50
+    gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
   internlm2-chat-1.8b-turbomind:
     gsm8k_accuracy: 28.12
     race-high_accuracy: 84.38
   internlm2-chat-1.8b-sft-turbomind:
-    gsm8k_accuracy: 21.88
+    gsm8k_accuracy: 31.25
     race-high_accuracy: 84.38
   internlm2-chat-7b-lmdeploy:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 59.38
     race-high_accuracy: 84.38
   internlm2-chat-7b-sft-turbomind:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 90.62
+  internlm3-8b-instruct-turbomind:
+    gsm8k_accuracy: 68.75
+    race-high_accuracy: 87.5
   internlm2-chat-7b-vllm:
-    gsm8k_accuracy: 43.75
+    gsm8k_accuracy: 59.38
-    race-high_accuracy: 84.38
+    race-high_accuracy: 87.50
   llama-3_1-8b-instruct-hf:
     gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-hf:
-    gsm8k_accuracy: 68.75
+    gsm8k_accuracy: 71.88
     race-high_accuracy: 81.25
   llama-3-8b-instruct-hf:
     gsm8k_accuracy: 68.75
@@ -72,14 +81,14 @@ chat:
     gsm8k_accuracy: 18.75
     race-high_accuracy: 46.88
   llama-3_1-8b-instruct-turbomind:
-    gsm8k_accuracy: 78.12
+    gsm8k_accuracy: 81.25
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-turbomind:
-    gsm8k_accuracy: 65.62
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 81.25
   llama-3-8b-instruct-turbomind:
-    gsm8k_accuracy: 71.88
+    gsm8k_accuracy: 68.75
-    race-high_accuracy: 87.5
+    race-high_accuracy: 84.38
   mistral-7b-instruct-v0.2-hf:
     gsm8k_accuracy: 40.62
     race-high_accuracy: 75
@@ -94,13 +103,10 @@ chat:
     race-high_accuracy: 78.12
   mistral-7b-instruct-v0.1-vllm:
     gsm8k_accuracy: 34.38
-    race-high_accuracy: 68.75
+    race-high_accuracy: 65.62
   mistral-7b-instruct-v0.2-vllm:
-    gsm8k_accuracy: 31.25
+    gsm8k_accuracy: 21.88
-    race-high_accuracy: 75
+    race-high_accuracy: 78.12
-  phi-3-mini-4k-instruct-hf:
-    gsm8k_accuracy: 81.25
-    race-high_accuracy: 87.50
   qwen2.5-0.5b-instruct-hf:
     gsm8k_accuracy: 34.38
     race-high_accuracy: 46.88
@@ -108,10 +114,10 @@ chat:
     gsm8k_accuracy: 53.12
     race-high_accuracy: 90.62
   qwen2.5-0.5b-instruct-turbomind:
-    gsm8k_accuracy: 28.12
+    gsm8k_accuracy: 31.25
-    race-high_accuracy: 50
+    race-high_accuracy: 43.75
   qwen2.5-3b-instruct-turbomind:
-    gsm8k_accuracy: 59.38
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 90.62
   qwen1.5-0.5b-chat-hf:
     gsm8k_accuracy: 0
@@ -123,11 +129,11 @@ chat:
     gsm8k_accuracy: 68.75
     race-high_accuracy: 90.62
   qwen2-1.5b-instruct-turbomind:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 84.38
   qwen2-7b-instruct-turbomind:
     gsm8k_accuracy: 81.25
-    race-high_accuracy: 90.62
+    race-high_accuracy: 87.50
   qwen1.5-0.5b-chat-vllm:
     gsm8k_accuracy: 3.12
     race-high_accuracy: 53.12
@@ -143,11 +149,11 @@ chat:
   yi-1.5-9b-chat-turbomind:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 93.75
-  deepseek-v2-lite-chat-hf:
+  deepseek-v2_lite-chat-turbomind:
-    gsm8k_accuracy: 46.88
+    gsm8k_accuracy: 37.5
     race-high_accuracy: 71.88
   gemma2-27b-it-hf:
-    gsm8k_accuracy: 75
+    gsm8k_accuracy: 71.88
     race-high_accuracy: 93.75
   internlm2_5-20b-chat-hf:
     gsm8k_accuracy: 84.38
@@ -161,6 +167,9 @@ chat:
   mistral-small-instruct-2409-turbomind:
     gsm8k_accuracy: 81.25
     race-high_accuracy: 87.50
+  phi-4:
+    gsm8k_accuracy: 81.25
+    race-high_accuracy: 87.50
   qwen2.5-14b-instruct-hf:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 96.88
@@ -168,40 +177,41 @@ chat:
     gsm8k_accuracy: 68.75
     race-high_accuracy: 93.75
   yi-1.5-34b-chat-turbomind:
-    gsm8k_accuracy: 78.12
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 93.75
-  deepseek-67b-chat-hf:
+  deepseek-67b-chat-turbomind:
-    gsm8k_accuracy: 71.88
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 78.12
+  deepseek-r1-distill-qwen-32b-turbomind:
+    gsm8k_accuracy: 25
+    race-high_accuracy: 90.62
   llama-3_3-70b-instruct-turbomind:
     gsm8k_accuracy: 93.75
     race-high_accuracy: 87.5
-  mixtral-8x7b-instruct-v0.1-hf:
-    gsm8k_accuracy: 59.38
-    race-high_accuracy: 81.25
   mixtral-large-instruct-2411-turbomind:
-    gsm8k_accuracy: 90.62
+    gsm8k_accuracy: 87.50
     race-high_accuracy: 93.75
   nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
-    gsm8k_accuracy: 87.5
+    gsm8k_accuracy: 93.75
-    race-high_accuracy: 46.88
+    race-high_accuracy: 50.00
   qwen2.5-72b-instruct-turbomind:
-    gsm8k_accuracy: 75
+    gsm8k_accuracy: 81.25
-    race-high_accuracy: 93.75
+    race-high_accuracy: 90.62
+  deepseek-r1-distill-llama-70b-turbomind:
+    gsm8k_accuracy: 40.62
+    race-high_accuracy: 90.62
   deepseek-v2_5-1210-turbomind:
     gsm8k_accuracy: 90.62
     race-high_accuracy: 84.38
-  mixtral-8x22b-instruct-v0.1-hf:
+  mixtral-8x22b-instruct-v0.1-turbomind:
-    gsm8k_accuracy: 81.25
+    gsm8k_accuracy: 75
-    race-high_accuracy: 81.25
+    race-high_accuracy: 78.12
+  mixtral-8x22b-instruct-v0.1-vllm:
+    gsm8k_accuracy: 78.12
+    race-high_accuracy: 78.12
 base:
-  glm-4-9b-hf:
-    gsm8k_accuracy: 68.75
-    GPQA_diamond_accuracy: 31.25
-    race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
   glm-4-9b-turbomind:
-    gsm8k_accuracy: 62.5
+    gsm8k_accuracy: 56.25
     GPQA_diamond_accuracy: 28.12
     race-high_accuracy: 93.75
     winogrande_accuracy: 84.38
@@ -210,15 +220,10 @@ base:
     GPQA_diamond_accuracy: 0
     race-high_accuracy: 46.88
     winogrande_accuracy: 71.88
-  deepseek-moe-16b-base-hf:
-    gsm8k_accuracy: 21.88
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 21.88
-    winogrande_accuracy: 65.62
   deepseek-7b-base-turbomind:
-    gsm8k_accuracy: 21.88
+    gsm8k_accuracy: 18.75
     GPQA_diamond_accuracy: 0
-    race-high_accuracy: 46.88
+    race-high_accuracy: 43.75
     winogrande_accuracy: 84.38
   deepseek-moe-16b-base-vllm:
     gsm8k_accuracy: 21.88
@@ -245,16 +250,21 @@ base:
     GPQA_diamond_accuracy: 3.12
     race-high_accuracy: 65.62
     winogrande_accuracy: 71.88
+  gemma-2-9b-turbomind:
+    gsm8k_accuracy: 68.75
+    GPQA_diamond_accuracy: 0
+    race-high_accuracy: 78.12
+    winogrande_accuracy: 50
   gemma-2b-vllm:
     gsm8k_accuracy: 15.62
     GPQA_diamond_accuracy: 3.12
-    race-high_accuracy:
+    race-high_accuracy: 28.12
-    winogrande_accuracy:
+    winogrande_accuracy: 68.75
   gemma-7b-vllm:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 43.75
-    GPQA_diamond_accuracy: 9.38
+    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy:
+    race-high_accuracy: 81.25
-    winogrande_accuracy:
+    winogrande_accuracy: 81.25
   internlm2_5-7b-hf:
     gsm8k_accuracy: 37.5
     GPQA_diamond_accuracy: 25
@@ -265,30 +275,25 @@ base:
     GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 62.5
     winogrande_accuracy: 78.12
-  internlm2-base-7b-hf:
-    gsm8k_accuracy: 3.12
-    GPQA_diamond_accuracy: 21.88
-    race-high_accuracy: 75
-    winogrande_accuracy: 65.62
   internlm2-1.8b-turbomind:
-    gsm8k_accuracy: 12.5
+    gsm8k_accuracy: 6.25
-    GPQA_diamond_accuracy: 9.38
+    GPQA_diamond_accuracy: 12.5
     race-high_accuracy: 71.88
-    winogrande_accuracy: 78.12
+    winogrande_accuracy: 75
   internlm2_5-7b-turbomind:
-    gsm8k_accuracy: 62.50
+    gsm8k_accuracy: 59.38
     GPQA_diamond_accuracy: 34.38
     race-high_accuracy: 93.75
-    winogrande_accuracy: 87.50
+    winogrande_accuracy: 84.38
   internlm2-7b-turbomind:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 50
-    GPQA_diamond_accuracy: 21.88
+    GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 71.88
     winogrande_accuracy: 84.38
   internlm2-base-7b-turbomind:
     gsm8k_accuracy: 37.50
-    GPQA_diamond_accuracy: 28.12
+    GPQA_diamond_accuracy: 21.88
-    race-high_accuracy: 81.25
+    race-high_accuracy: 84.38
     winogrande_accuracy: 75
   llama-2-7b-hf:
     gsm8k_accuracy: 21.88
@@ -311,7 +316,7 @@ base:
     race-high_accuracy: 78.12
     winogrande_accuracy: 78.12
   llama-3-8b-turbomind:
-    gsm8k_accuracy: 50
+    gsm8k_accuracy: 46.88
     GPQA_diamond_accuracy: 12.50
     race-high_accuracy: 65.62
     winogrande_accuracy: 78.12
@@ -327,14 +332,14 @@ base:
     winogrande_accuracy: 71.88
   qwen2.5-1.5b-turbomind:
     gsm8k_accuracy: 62.50
-    GPQA_diamond_accuracy: 12.50
+    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 78.12
+    race-high_accuracy: 75
-    winogrande_accuracy: 68.75
-  qwen2.5-7b-turbomind:
-    gsm8k_accuracy: 75.00
-    GPQA_diamond_accuracy: 25
-    race-high_accuracy: 87.5
     winogrande_accuracy: 71.88
+  qwen2.5-7b-turbomind:
+    gsm8k_accuracy: 71.88
+    GPQA_diamond_accuracy: 18.75
+    race-high_accuracy: 87.5
+    winogrande_accuracy: 75.00
   qwen1.5-moe-a2.7b-hf:
     gsm8k_accuracy: 62.5
     GPQA_diamond_accuracy: 18.75
@@ -356,17 +361,17 @@ base:
     race-high_accuracy: 87.5
     winogrande_accuracy: 68.75
   qwen2-1.5b-turbomind:
-    gsm8k_accuracy: 56.25
+    gsm8k_accuracy: 59.38
-    GPQA_diamond_accuracy: 9.38
+    GPQA_diamond_accuracy: 12.50
     race-high_accuracy: 81.25
     winogrande_accuracy: 75
   qwen2-7b-turbomind:
-    gsm8k_accuracy: 75.00
+    gsm8k_accuracy: 65.62
     GPQA_diamond_accuracy: 12.5
     race-high_accuracy: 87.5
     winogrande_accuracy: 71.88
   qwen1.5-0.5b-vllm:
-    gsm8k_accuracy: 9.38
+    gsm8k_accuracy: 6.25
     GPQA_diamond_accuracy: 0
     race-high_accuracy: 56.25
     winogrande_accuracy: 62.5
@@ -382,27 +387,12 @@ base:
     winogrande_accuracy: 59.38
   yi-1.5-9b-turbomind:
     gsm8k_accuracy: 78.12
-    GPQA_diamond_accuracy: 40.62
+    GPQA_diamond_accuracy: 43.75
     race-high_accuracy: 87.5
     winogrande_accuracy: 71.88
-  deepseek-v2-lite-hf:
-    gsm8k_accuracy: 31.25
-    GPQA_diamond_accuracy: 28.12
-    race-high_accuracy: 59.38
-    winogrande_accuracy: 71.88
-  internlm2-20b-hf:
-    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 68.75
-    winogrande_accuracy: 75
-  internlm2-base-20b-hf:
-    gsm8k_accuracy: 12.5
-    GPQA_diamond_accuracy: 9.38
-    race-high_accuracy: 84.38
-    winogrande_accuracy: 65.62
   internlm2-20b-turbomind:
-    gsm8k_accuracy: 71.88
+    gsm8k_accuracy: 75
-    GPQA_diamond_accuracy: 15.62
+    GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 68.75
     winogrande_accuracy: 81.25
   qwen2.5-14b-hf:
@@ -416,37 +406,27 @@ base:
     race-high_accuracy: 93.75
     winogrande_accuracy: 78.12
   qwen2.5-32b-turbomind:
-    gsm8k_accuracy: 84.38
+    gsm8k_accuracy: 87.5
-    GPQA_diamond_accuracy: 28.12
+    GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 93.75
     winogrande_accuracy: 81.25
-  deepseek-67b-base-hf:
-    gsm8k_accuracy: 59.38
-    GPQA_diamond_accuracy: 31.25
-    race-high_accuracy: 81.25
-    winogrande_accuracy: 90.62
   deepseek-67b-base-turbomind:
-    gsm8k_accuracy: 56.25
+    gsm8k_accuracy: 53.12
     GPQA_diamond_accuracy: 28.12
     race-high_accuracy: 81.25
     winogrande_accuracy: 84.38
   llama-3-70b-turbomind:
-    gsm8k_accuracy: 59.38
+    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 9.38
+    GPQA_diamond_accuracy: 12.50
     race-high_accuracy: 93.75
     winogrande_accuracy: 84.38
   qwen2.5-72b-turbomind:
     gsm8k_accuracy: 84.38
-    GPQA_diamond_accuracy: 34.38
+    GPQA_diamond_accuracy: 31.25
     race-high_accuracy: 93.75
     winogrande_accuracy: 87.5
   deepseek-v2-turbomind:
-    gsm8k_accuracy: 65.62
+    gsm8k_accuracy: 59.38
-    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
-  llama-3-70b-hf:
-    gsm8k_accuracy: 62.5
     GPQA_diamond_accuracy: 3.12
     race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
+    winogrande_accuracy: 81.25
25 .github/workflows/daily-run-test.yml vendored
@@ -61,6 +61,7 @@ env:
   HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
   HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
   CONDA_ENV: regression_test
+  export VLLM_WORKER_MULTIPROC_METHOD: spawn

 jobs:
   build-pypi:
@@ -92,7 +93,6 @@ jobs:
       matrix:
         pyver: [py310]
     runs-on: ubuntu-latest
-    environment: 'prod'
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
       PLAT_NAME: manylinux2014_x86_64
@@ -126,7 +126,6 @@ jobs:
     if: ${{!cancelled()}}
     needs: ['build-pypi', 'build-pypi-lmdeploy']
     runs-on: volc_cu12
-    environment: 'prod'
     timeout-minutes: 120 #2hours
     steps:
       - name: Clone repository
@@ -157,7 +156,9 @@ jobs:
          pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
          pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
          pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-         pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
+         pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
+         pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
+         pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
          cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
@@ -188,7 +189,6 @@ jobs:
       matrix:
         regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
     runs-on: volc_cu12_daily
-    environment: 'prod'
     timeout-minutes: 180 #3hours
     steps:
       - name: Clone repository
@@ -229,7 +229,6 @@ jobs:
       matrix:
         regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
     runs-on: volc_cu12_local
-    environment: 'prod'
     timeout-minutes: 480 #6hours
     steps:
       - name: Clone repository
@@ -256,27 +255,33 @@ jobs:
          conda info --envs
          export from_tf=TRUE
          python tools/list_configs.py internlm2_5 mmlu
-         opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
+         opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
          python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
-         opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
+         opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
          python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
          python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
-         opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
+         opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
          python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
+         opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
+         rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
+         python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
       - name: Run model test - api
         if: matrix.regression_func == 'api'
         run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
-         lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
+         lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
          echo "restful_pid=$!" >> "$GITHUB_ENV"
          sleep 180s
+         env | grep PROXY
+         env | grep proxy
+         unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
          python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@@ -305,7 +310,6 @@ jobs:
       matrix:
         function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
     runs-on: volc_cu12
-    environment: 'prod'
     timeout-minutes: 480 #6hours
     steps:
       - name: Clone repository
@@ -339,7 +343,6 @@ jobs:
     needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
     timeout-minutes: 5
     runs-on: self-hosted
-    environment: 'prod'
     steps:
       - name: notify
        run: |
2 .github/workflows/pr-run-test.yml vendored
@@ -45,7 +45,7 @@ jobs:
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          python3 -m pip uninstall opencompass -y
-         python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+         python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
          conda info --envs
       - name: conda env
         run: |
@@ -715,6 +715,12 @@
     paper: https://arxiv.org/pdf/1809.02789v1
     configpath: opencompass/configs/datasets/obqa/obqa_gen.py
     configpath_llmjudge: ''
+- olymmath:
+    name: OlymMATH
+    category: Math
+    paper: https://arxiv.org/abs/2503.21380
+    configpath: ''
+    configpath_llmjudge: opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py
 - piqa:
     name: OpenBookQA
     category: Knowledge / Physics
@@ -117,6 +117,10 @@ html_js_files = [
     'js/custom.js'
 ]
+
+html_context = {
+    'github_version': 'main',
+}

 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
@@ -117,6 +117,10 @@ html_js_files = [
     'js/custom.js'
 ]
+
+html_context = {
+    'github_version': 'main',
+}

 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
@@ -1 +1 @@
-__version__ = '0.4.1'
+__version__ = '0.4.2'
60 opencompass/configs/datasets/OlymMATH/README.md Normal file
@@ -0,0 +1,60 @@
# OlymMATH

[GitHub Link](https://github.com/RUCAIBox/OlymMATH)

For the OlymMATH dataset, please refer to the paper:

Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models, by Haoxiang Sun, Yingqian Min, Zhipeng Chen, Wayne Xin Zhao, Zheng Liu, Zhongyuan Wang, Lei Fang, and Ji-Rong Wen.

## How to eval OlymMATH with model judge

This is a simple example:

```python
from opencompass.models import OpenAISDK, OpenAI
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as qwen2_5_7b_instruct_model
    from opencompass.configs.datasets.OlymMATH.olymmath_gen import olymmath_datasets

################## Judge Config ##################
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

judge_cfg = dict(
    # An API model with the OpenAI API format is required for the judge
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',
    key='sk-1234',
    openai_api_base=[
        'http://172.30.56.1:4000/v1',
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    max_completion_tokens=32768,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    max_seq_len=32768,
)

################## Model Config ##################
models = [*qwen2_5_7b_instruct_model]

################## Dataset Config ##################
datasets = [*olymmath_datasets]

# Set judge_cfg for evaluation
for item in datasets:
    item['infer_cfg']['inferencer']['max_out_len'] = 32768
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg

work_dir = './outputs/olymmath_llm_eval'
```
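If the snippet above is saved as a standalone config file (a name like `eval_olymmath.py` is only illustrative), it can be launched with the same CLI entry point used by the workflows in this commit, e.g. `opencompass eval_olymmath.py`.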
@@ -0,0 +1,5 @@
from mmengine.config import read_base

with read_base():
    # Default use LLM as a judge
    from .olymmath_llmverify_gen_97b203 import olymmath_datasets  # noqa: F401, F403
@@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import OlymMATHDataset


# ----------------------------- Detailed Config -----------------------------

math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']

GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{problem}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()

# Evaluation configuration

olymmath_datasets = []

for sub_set in sub_sets:
    math_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=OlymMATHDataset,
                path='RUC-AIBOX/OlymMATH',
                reader_cfg=math_reader_cfg,
                subset=sub_set,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    olymmath_datasets.append(
        dict(
            type=OlymMATHDataset,
            abbr=f'olymmath_llmjudge_{sub_set}',
            path='RUC-AIBOX/OlymMATH',
            reader_cfg=math_reader_cfg,
            infer_cfg=math_infer_cfg,
            eval_cfg=math_eval_cfg,
            subset=sub_set,
        )
    )
@@ -1,15 +1,14 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import MATHEvaluator
 from opencompass.datasets import (
     MATHDataset,
+    MATHEvaluator,
     math_postprocess_v2,
     normalize_final_answer,
 )

 math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

 math_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
@@ -28,7 +27,8 @@ math_infer_cfg = dict(

 # postprocess v2
 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator)
+    evaluator=dict(type=MATHEvaluator, version='v2'),
+    pred_postprocessor=dict(type=math_postprocess_v2),
 )

 math_datasets = [
@@ -41,4 +41,4 @@ math_datasets = [
         infer_cfg=math_infer_cfg,
         eval_cfg=math_eval_cfg,
     )
 ]
@@ -0,0 +1,44 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import (
    MATHDataset,
    math_postprocess_v2,
    normalize_final_answer,
)

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator)
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math_prm800k_500',
        path='opencompass/math',
        file_name='test_prm800k_500.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .math_prm800k_500_0shot_cot_gen import math_datasets  # noqa: F401, F403
+    from .math_prm800k_500_0shot_cot_gen_11c4b5 import math_datasets  # noqa: F401, F403
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='internvl2_5-38b-turbomind',
        path='OpenGVLab/InternVL2_5-38B',
        engine_config=dict(session_len=8192, max_batch_size=8, tp=4),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
        max_seq_len=8192,
        max_out_len=8192,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
    )
]
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='internvl2_5-8b-turbomind',
        path='OpenGVLab/InternVL2_5-8B',
        engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192),
        max_seq_len=8192,
        max_out_len=8192,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
@@ -0,0 +1,22 @@
from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='mixtral-8x22b-instruct-v0.1-turbomind',
        path='mistralai/Mixtral-8x22B-Instruct-v0.1',
        engine_config=dict(
            session_len=32768,
            max_batch_size=16,
            tp=8,
            cache_max_entry_count=0.7,
        ),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=8),
    )
]
@@ -48,7 +48,7 @@ def clean_units(pred_str: str):


 def number_it(num):
-    from latex2sympy2 import latex2sympy
+    from latex2sympy2_extended import latex2sympy
     if isinstance(num, (int, float)):
         return num

@@ -17,7 +17,7 @@ def time_limit(seconds: float):


 def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
-    from latex2sympy2 import latex2sympy
+    from latex2sympy2_extended import latex2sympy

     if any([option in pred.lower() for option in ['yes', 'true']]):
         pred = 'True'
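Both hunks above swap the same import, so the new dependency can be smoke-tested in isolation. A minimal sketch, assuming `latex2sympy2_extended` is installed and exposes `latex2sympy` with the same call shape as the old package (which is exactly what the diff relies on):

```python
# Sanity check for the drop-in replacement; the expression is arbitrary.
from latex2sympy2_extended import latex2sympy

expr = latex2sympy(r'\frac{1}{2}')  # -> sympy expression 1/2
print(float(expr.evalf()))          # expected: 0.5
```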
@@ -106,6 +106,7 @@ from .natural_question import *  # noqa: F401, F403
 from .natural_question_cn import *  # noqa: F401, F403
 from .NPHardEval import *  # noqa: F401, F403
 from .obqa import *  # noqa: F401, F403
+from .olymmath import *  # noqa: F401, F403
 from .OlympiadBench import *  # noqa: F401, F403
 from .OpenFinData import *  # noqa: F401, F403
 from .piqa import *  # noqa: F401, F403
14 opencompass/datasets/olymmath.py Normal file
@@ -0,0 +1,14 @@
from datasets import load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class OlymMATHDataset(BaseDataset):

    @staticmethod
    def load(path: str, subset: str):
        dataset = load_dataset(path, subset)
        return dataset
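For orientation (not part of the commit), a minimal sketch of exercising the new loader directly; the subset name comes from the `sub_sets` list in the config above, and `load_dataset` is the Hugging Face `datasets` loader, so `path` resolves to the `RUC-AIBOX/OlymMATH` hub repo:

```python
# Hypothetical direct call; OpenCompass normally instantiates the class
# from the dataset configs shown earlier in this commit.
from opencompass.datasets import OlymMATHDataset

ds = OlymMATHDataset.load(path='RUC-AIBOX/OlymMATH', subset='en-hard')
print(ds)  # a datasets.DatasetDict for the requested subset
```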
@@ -33,6 +33,7 @@ class ClaudeSDK(BaseAPIModel):
         max_seq_len: int = 2048,
         meta_template: Optional[Dict] = None,
         temperature: Optional[float] = 0.0,
+        thinking: Optional[Dict] = None,
         retry: int = 2,
     ):
         super().__init__(path=path,
@@ -49,6 +50,7 @@ class ClaudeSDK(BaseAPIModel):
         self.anthropic = Anthropic(api_key=key)
         self.model = path
         self.temperature = temperature
+        self.thinking = thinking

     def generate(
         self,
@@ -108,11 +110,26 @@ class ClaudeSDK(BaseAPIModel):
         while num_retries < self.retry:
             self.wait()
             try:
-                responses = self.anthropic.messages.create(
-                    model=self.model,
-                    max_tokens=max_out_len,
-                    temperature=self.temperature,
-                    messages=messages)
+                api_params = {
+                    'model': self.model,
+                    'max_tokens': max_out_len,
+                    'temperature': self.temperature,
+                    'messages': messages,
+                }
+
+                if self.thinking is not None:
+                    api_params['thinking'] = self.thinking
+                    api_params['stream'] = True
+
+                responses = self.anthropic.messages.create(**api_params)
+
+                # Handle new response format
+                for content in responses.content:
+                    if content.type == 'text':
+                        return content.text
+
+                # If no text type content is found, return the first
+                # content (backward compatibility)
                 return responses.content[0].text
             except Exception as e:
                 self.logger.error(e)
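Not part of the diff, but for context: the new `thinking` argument travels straight through to `anthropic.messages.create()`, with streaming switched on when it is set. A hedged sketch of a model entry that would exercise it; the `{'type': 'enabled', 'budget_tokens': ...}` shape follows Anthropic's extended-thinking request format, and the abbr, model name, and key are placeholders:

```python
# Illustrative config only: enables extended thinking via the new parameter.
from opencompass.models import ClaudeSDK

models = [
    dict(
        type=ClaudeSDK,
        abbr='claude-thinking-test',        # placeholder abbreviation
        path='claude-3-7-sonnet-20250219',  # assumed model name
        key='YOUR_API_KEY',                 # placeholder key
        temperature=1.0,
        # Assumption: Anthropic's extended-thinking schema; the dict is
        # forwarded unchanged as api_params['thinking'].
        thinking=dict(type='enabled', budget_tokens=1024),
    ),
]
```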
@@ -652,7 +652,6 @@ class OpenAISDK(OpenAI):
                 self.logger.info('Start calling OpenAI API')
             responses = self.openai_client.chat.completions.create(
                 **query_data, timeout=timeout)  # timeout in seconds
-
             if self.verbose:
                 self.logger.info(
                     'Successfully get response from OpenAI API')
@@ -660,10 +659,18 @@ class OpenAISDK(OpenAI):
                     self.logger.info(responses)
                 except Exception:
                     pass  # noqa F841
-            if not responses.choices:
+
+            # Check if response is empty or content is empty
+            if not responses.choices or not responses.choices[
+                    0].message.content:
                 self.logger.error(
-                    'Response is empty, it is an internal server error \
-                        from the API provider.')
+                    'API response is empty, it might be due to excessive '
+                    'input length or an internal server error '
+                    'from your API provider.')
+                num_retries += 1
+                # Continue to retry instead of returning empty response
+                continue
+
             return responses.choices[0].message.content

         except (BadRequestError, APIStatusError) as e:
@@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer):
             f.write(','.join(new_header) + '\n')
             for line in new_table:
                 f.write(','.join(map(str, line)) + '\n')
-        print(t)
         print(output_file)
         return {'qa_bench_' + show_dataset_abbr:json_result}
@@ -11,12 +11,10 @@ faiss_gpu==1.7.2
 -e git+https://github.com/open-compass/human-eval.git#egg=human-eval
 # IFEval
 langdetect
-# TheoremQA
-latex2sympy2==1.9.1
 # Lawbench, leval
 ltp
 # Math
-math-verify
+math-verify[antlr4_11_0]
 # Taco, apps Dataset
 pyext
 # Law Bench