Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00
[CI] fix baseline score (#2000)
* update * update * update * update * update * update * update * updaste * update * update * updaste * updaste * update * update * update * update * update * update * update * update
This commit is contained in:
parent 3a9a384173
commit f982d6278e
.github/scripts/eval_regression_api.py (vendored, 6 lines changed)
@@ -24,9 +24,9 @@ models = [
         abbr='lmdeploy-api-test',
         type=OpenAISDK,
         key='EMPTY',
-        openai_api_base='http://0.0.0.0:23333/v1',
-        path='internlm2',
-        tokenizer_path='internlm/internlm2_5-7b-chat',
+        openai_api_base='http://localhost:23333/v1',
+        path='internlm3',
+        tokenizer_path='internlm/internlm3-8b-instruct',
         rpm_verbose=True,
         meta_template=api_meta_template,
         query_per_second=128,
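For readers reconstructing the resulting config: a minimal sketch of the full model entry after this change, assuming the hunk shows every field and that OpenCompass is installed. In the real script api_meta_template is defined earlier in the file, so a placeholder stands in for it here.

# Sketch of the updated entry in eval_regression_api.py (assumptions noted above).
from opencompass.models import OpenAISDK

api_meta_template = None  # placeholder; the real template is defined earlier in the script

models = [
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
    )
]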
.github/scripts/eval_regression_base_models.py (vendored, 18 lines changed)
@@ -11,18 +11,10 @@ with read_base():
     from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
         winogrande_datasets  # noqa: F401, E501
     # read hf models - chat models
     from opencompass.configs.models.chatglm.hf_glm4_9b import \
         models as hf_glm4_9b_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
         models as lmdeploy_glm4_9b_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
-        models as hf_deepseek_7b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
-        models as hf_deepseek_67b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
-        models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
-        models as hf_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
         models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
@@ -49,12 +41,6 @@ with read_base():
         models as hf_internlm2_5_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
         models as hf_internlm2_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
-        models as hf_internlm2_20b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
-        models as hf_internlm2_base_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
-        models as hf_internlm2_base_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
         models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
@@ -65,14 +51,14 @@ with read_base():
         models as lmdeploy_internlm2_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
         models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
         models as lmdeploy_internlm2_base_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama2_7b import \
         models as hf_llama2_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
         models as hf_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b import \
         models as hf_llama3_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_70b import \
         models as hf_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
         models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
.github/scripts/eval_regression_chat_models.py (vendored, 41 lines changed)
@@ -15,14 +15,24 @@ with read_base():
         models as vllm_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
         models as hf_deepseek_7b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
-        models as hf_deepseek_67b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
-        models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
-        models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
         models as lmdeploy_deepseek_67b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_llama_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_llama_70b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_qwen_1_5b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_qwen_32b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
+        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
+        models as lmdeploy_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
         models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
@@ -45,6 +55,8 @@ with read_base():
         models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
         models as hf_internlm2_5_20b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
+        models as hf_internlm3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
         models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
@@ -57,6 +69,8 @@ with read_base():
         models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
         models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
+        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
         models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
@@ -83,10 +97,6 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
         models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
         models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
         models as \
         lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
@@ -95,14 +105,19 @@ with read_base():
     from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
         models as \
         lmdeploy_mistral_small_instruct_2409_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
         models as \
         lmdeploy_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
         models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
         models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
         models as vllm_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
         models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
         models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.phi.hf_phi_4 import \
+        models as hf_phi_4_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
         models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
@@ -142,6 +157,8 @@ with read_base():
 
     from ...volc import infer as volc_infer  # noqa: F401, E501
 
+hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
+
 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
 
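The tail of this config (the race_datasets and datasets lines above) relies on a locals()-scanning idiom: every *_datasets list pulled in through read_base() is concatenated into one flat datasets list, and race_datasets = [race_datasets[1]] first trims RACE to a single split. A self-contained sketch of the idiom, with toy lists standing in for the imported OpenCompass dataset configs:

# Demonstration of the aggregation idiom used at the bottom of the regression configs.
race_datasets = [{'abbr': 'race-middle'}, {'abbr': 'race-high'}]
gsm8k_datasets = [{'abbr': 'demo_gsm8k'}]

# Flatten every local variable whose name ends in `_datasets` into one list.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([d['abbr'] for d in datasets])
# -> ['race-middle', 'race-high', 'demo_gsm8k'] (order follows definition order)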
.github/scripts/oc_score_assert.py (vendored, 40 lines changed)
@@ -175,10 +175,11 @@ class TestApibench:
 class TestVolcFullbench:
     """Test cases for chat model."""
 
-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
-         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
+        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
+        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
+    ] for p2 in dataset_list(p1, 'objective')])
     @pytest.mark.chat_objective
     def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):
@@ -245,10 +246,7 @@ class TestCmdCase:
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b-hf', 'race-middle_accuracy'),
                               ('internlm2_5-7b-hf', 'race-high_accuracy'),
-                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-middle_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-high_accuracy'),
-                              ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
+                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
@@ -260,9 +258,9 @@ class TestCmdCase:
         [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
          ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
          ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
+         ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
@@ -280,13 +278,25 @@ class TestCmdCase:
 
     @pytest.mark.case4
     @pytest.mark.parametrize(
-        'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(model, result_score, base_score, dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.case5
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
+    def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
 
 
 def assert_score(model_type, score, baseline, dataset: str = ''):
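The body of assert_score is not shown in this diff; only its signature appears above. A hedged sketch of the kind of tolerance gate such a helper typically implements (the bounds below are illustrative, not the repository's actual thresholds):

def assert_score(model_type, score, baseline, dataset: str = ''):
    # Hypothetical tolerance logic; the repository's real thresholds differ.
    assert score is not None, f'{model_type}/{dataset}: no score produced'
    # Allow small run-to-run noise around the recorded baseline.
    assert abs(float(score) - float(baseline)) <= max(2.0, 0.05 * float(baseline)), (
        f'{model_type}/{dataset}: {score} drifted from baseline {baseline}')

assert_score('internlm3-8b-instruct-lmdeploy', 93.38, 93.38, 'race-middle_accuracy')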
.github/scripts/oc_score_baseline.yaml (vendored, 29 lines changed)
@@ -8,20 +8,25 @@ internlm2_5-7b_hf:
   race-middle_accuracy: 91.78
   race-high_accuracy: 90.02
 
-internlm2-1.8b-hf:
-  demo_gsm8k_accuracy: 15.62
-  race-middle_accuracy: 71.66
-  race-high_accuracy: 66.38
-
 internlm2_5-7b-chat-lmdeploy:
-  demo_gsm8k_accuracy: 89.06
+  demo_gsm8k_accuracy: 87.50
   race-middle_accuracy: 92.76
   race-high_accuracy: 90.54
 
-internlm2-chat-1.8b-lmdeploy:
-  demo_gsm8k_accuracy: 31
-  race-middle_accuracy: 81.34
-  race-high_accuracy: 73.96
+internlm3-8b-instruct-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-vllm:
+  demo_gsm8k_accuracy: 81.25
+  race-middle_accuracy: 92.20
+  race-high_accuracy: 89.88
 
 internlm2_5-7b-chat_hf:
   demo_gsm8k_accuracy: 87.50
@@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf:
   race-high_accuracy: 90.48
 
 lmdeploy-api-test:
-  gsm8k_accuracy: 68.75
-  race-middle_accuracy: 87.50
+  gsm8k_accuracy: 56.25
+  race-middle_accuracy: 93.75
   race-high_accuracy: 93.75
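These YAML files feed the baseline_scores fixtures used by the tests above. A minimal sketch of a loader, assuming PyYAML is installed and the working directory is a repository checkout; the function name is illustrative, not the fixture's actual implementation:

import yaml

def load_baselines(path='.github/scripts/oc_score_baseline.yaml'):
    # Returns {model_abbr: {metric_name: score}}, mirroring the YAML layout.
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)

baselines = load_baselines()
print(baselines['internlm3-8b-instruct-lmdeploy']['demo_gsm8k_accuracy'])  # 73.44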
.github/scripts/oc_score_baseline_fullbench.yaml (vendored, 599 lines changed)
@@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
     college_knowledge_naive_average: 87.5
   subjective:
     alignment_bench_v1_1_总分: 0.66
-    alpaca_eval_total: 20
+    alpaca_eval_total: 0
     arenahard_score: 50
     Followbench_naive_average: 1
     CompassArena_naive_average: 43
     mtbench101_avg: 7.8
-    wildbench_average: -12.78
+    wildbench_average: -15.56
     simpleqa_accuracy_given_attempted: 0
     chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 7.90
+    alignment_bench_v1_1_专业能力: 8.00
     alignment_bench_v1_1_数学计算: 0
     alignment_bench_v1_1_基本任务: 0
     alignment_bench_v1_1_逻辑推理: 0
@@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench:
     alignment_bench_v1_1_文本写作: 0
     alignment_bench_v1_1_角色扮演: 0
     alignment_bench_v1_1_综合问答: 0
-    alpaca_eval_helpful_base: 20
+    alpaca_eval_helpful_base: 0
     compassarena_language_naive_average: 35
     compassarena_knowledge_naive_average: 55
     compassarena_reason_v2_naive_average: 40
@@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench:
 internlm2_5-7b-chat-turbomind_fullbench:
   objective:
     race-high_accuracy: 93.75
-    ARC-c_accuracy: 93.75
+    ARC-c_accuracy: 87.50
     BoolQ_accuracy: 68.75
     triviaqa_wiki_1shot_score: 50
     nq_open_1shot_score: 25
     IFEval_Prompt-level-strict-accuracy: 56.25
-    drop_accuracy: 81.25
+    drop_accuracy: 75
     GPQA_diamond_accuracy: 31.25
-    hellaswag_accuracy: 81.25
-    TheoremQA_score: 6.25
+    hellaswag_accuracy: 87.5
+    TheoremQA_score: 12.5
     musr_average_naive_average: 39.58
-    korbench_single_naive_average: 37.50
-    gsm8k_accuracy: 68.75
-    math_accuracy: 68.75
+    korbench_single_naive_average: 40
+    gsm8k_accuracy: 62.5
+    math_accuracy: 75
     cmo_fib_accuracy: 6.25
     aime2024_accuracy: 6.25
-    wikibench-wiki-single_choice_cncircular_perf_4: 50.00
+    wikibench-wiki-single_choice_cncircular_perf_4: 25
     sanitized_mbpp_score: 68.75
-    ds1000_naive_average: 16.96
+    ds1000_naive_average: 17.86
     lcb_code_generation_pass@1: 12.5
     lcb_code_execution_pass@1: 43.75
-    lcb_test_output_pass@1: 25.00
-    bbh-logical_deduction_seven_objects_score: 50.00
-    bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_accuracy: 69.71
-    cmmlu-china-specific_accuracy: 75.83
+    lcb_test_output_pass@1: 18.75
+    bbh-logical_deduction_seven_objects_score: 56.25
+    bbh-multistep_arithmetic_two_score: 75
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 78.33
     mmlu_pro_math_accuracy: 31.25
-    ds1000_Pandas_accuracy: 0
+    ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
     ds1000_Tensorflow_accuracy: 12.5
-    ds1000_Scipy_accuracy: 18.75
+    ds1000_Scipy_accuracy: 25
     ds1000_Sklearn_accuracy: 18.75
-    ds1000_Pytorch_accuracy: 18.75
+    ds1000_Pytorch_accuracy: 6.25
     ds1000_Matplotlib_accuracy: 50.00
     openai_mmmlu_lite_AR-XY_accuracy: 37.5
     college_naive_average: 12.50
     college_knowledge_naive_average: 87.5
   subjective:
-    alignment_bench_v1_1_总分: 0.70
+    alignment_bench_v1_1_总分: 0.66
     alpaca_eval_total: 0
     arenahard_score: 50
     Followbench_naive_average: 1
-    CompassArena_naive_average: 38
-    mtbench101_avg: 7.80
-    wildbench_average: -4.86
+    CompassArena_naive_average: 40
+    mtbench101_avg: 8
+    wildbench_average: -6.81
     simpleqa_accuracy_given_attempted: 0
     chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 8.4
+    alignment_bench_v1_1_专业能力: 7.9
     alignment_bench_v1_1_数学计算: 0
     alignment_bench_v1_1_基本任务: 0
     alignment_bench_v1_1_逻辑推理: 0
@@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
     alignment_bench_v1_1_综合问答: 0
     alpaca_eval_helpful_base: 0
     compassarena_language_naive_average: 35
-    compassarena_knowledge_naive_average: 50
-    compassarena_reason_v2_naive_average: 30
-    compassarena_math_v2_naive_average: 50
-    compassarena_creationv2_zh_naive_average: 25
+    compassarena_knowledge_naive_average: 45
+    compassarena_reason_v2_naive_average: 25
+    compassarena_math_v2_naive_average: 60
+    compassarena_creationv2_zh_naive_average: 35
     followbench_llmeval_en_HSR_AVG: 1
     followbench_llmeval_en_SSR_AVG: 1
     followbench_llmeval_en_HSR_L1: 1
@@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench:
     drop_accuracy: 62.5
     GPQA_diamond_accuracy: 62.5
     hellaswag_accuracy: 93.75
-    TheoremQA_score: 25.00
+    TheoremQA_score: 31.25
     winogrande_accuracy: 87.5
-    gsm8k_accuracy: 62.50
-    GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
+    gsm8k_accuracy: 56.25
+    GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
     GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
     math_accuracy: 18.75
     wikibench-wiki-single_choice_cncircular_perf_4: 25
     sanitized_mbpp_score: 62.50
-    dingo_en_192_score: 31.25
+    dingo_en_192_score: 50.00
     dingo_zh_170_score: 93.75
     mmlu-other_accuracy: 76.92
     cmmlu-china-specific_accuracy: 84.17
     mmlu_pro_math_accuracy: 18.75
-    bbh-logical_deduction_seven_objects_score: 50
+    bbh-logical_deduction_seven_objects_score: 43.75
     bbh-multistep_arithmetic_two_score: 56.25
     college_naive_average: 12.5
     college_knowledge_naive_average: 87.5
@@ -409,7 +409,7 @@ internlm2_5-7b-chat-turbomind:
     alpaca_eval_koala: 28.21
     alpaca_eval_oasst: 23.4
     alpaca_eval_selfinstruct: 30.95
-    alpaca_eval_vicuna: 25
+    alpaca_eval_vicuna: 33.75
     compassarena_language_naive_average: 52.5
     compassarena_knowledge_naive_average: 36
     compassarena_reason_v2_naive_average: 35
@@ -454,3 +454,530 @@ internlm2_5-7b-chat-1m-turbomind:
     longbench_few-shot-learning_score: 51.67
     longbench_synthetic-tasks_score: 66.83
     longbench_code-completion_score: 45.99
+
+
+qwen2.5-7b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 84.99
+    ARC-c_accuracy: 92.2
+    BoolQ_accuracy: 86.7
+    triviaqa_wiki_1shot_score: 53.06
+    nq_open_1shot_score: 17.51
+    mmmlu_lite_naive_average: 54.96
+    IFEval_Prompt-level-strict-accuracy: 71.53
+    drop_accuracy: 80.07
+    bbh_naive_average: 68.81
+    GPQA_diamond_accuracy: 34.34
+    hellaswag_accuracy: 85.42
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.44
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 92.57
+    GaokaoBench_weighted_average: 80.14
+    math_accuracy: 73.58
+    cmo_fib_accuracy: 25
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 77.33
+    wikibench-wiki-single_choice_cncircular_perf_4: 34.9
+    cmmlu_naive_average: 75.97
+    mmlu_naive_average: 76.01
+    mmlu_pro_naive_average: 56.12
+    openai_humaneval_humaneval_pass@1: 83.54
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.29
+    ds1000_naive_average: 18.66
+    lcb_code_generation_pass@1: 39.5
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.68
+    bigcodebench_hard_instruct_pass@1: 16.22
+    bigcodebench_hard_complete_pass@1: 11.49
+    teval_naive_average: 79.72
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 99.01
+    mmlu_accuracy: 76.01
+    mmlu-stem_accuracy: 77.59
+    mmlu-social-science_accuracy: 79.02
+    mmlu-humanities_accuracy: 72.07
+    mmlu-other_accuracy: 74.86
+    cmmlu_accuracy: 75.97
+    cmmlu-stem_accuracy: 73.09
+    cmmlu-social-science_accuracy: 75.95
+    cmmlu-humanities_accuracy: 76.53
+    cmmlu-other_accuracy: 78.79
+    cmmlu-china-specific_accuracy: 73.17
+    mmlu_pro_accuracy: 56.12
+    mmlu_pro_biology_accuracy: 71.41
+    mmlu_pro_business_accuracy: 67.68
+    mmlu_pro_chemistry_accuracy: 54.59
+    mmlu_pro_computer_science_accuracy: 58.29
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 42.41
+    mmlu_pro_health_accuracy: 55.87
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 28.97
+    mmlu_pro_math_accuracy: 73.13
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 58.43
+    mmlu_pro_psychology_accuracy: 63.16
+    mmlu_pro_other_accuracy: 53.57
+    humanevalx-python_pass@1: 50
+    humanevalx-cpp_pass@1: 42.07
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 75
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.18
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 10.43
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 60.65
+    mmmlu_lite_accuracy: 54.96
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.25
+    openai_mmmlu_lite_DE-DE_accuracy: 59.93
+    openai_mmmlu_lite_ES-LA_accuracy: 66.53
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 49.26
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.47
+    openai_mmmlu_lite_JA-JP_accuracy: 61.54
+    openai_mmmlu_lite_KO-KR_accuracy: 60.28
+    openai_mmmlu_lite_PT-BR_accuracy: 55.51
+    openai_mmmlu_lite_SW-KE_accuracy: 36.42
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.61
+    college_naive_average: 48
+    high_naive_average: 59
+    middle_naive_average: 78
+    primary_naive_average: 85.67
+    arithmetic_naive_average: 75.67
+    mathbench-a (average)_naive_average: 69.27
+    college_knowledge_naive_average: 83.86
+    high_knowledge_naive_average: 80.29
+    middle_knowledge_naive_average: 84.26
+    primary_knowledge_naive_average: 93.16
+    mathbench-t (average)_naive_average: 85.39
+
+
+internlm2_5-7b-chat-pytorch:
+  objective:
+    race-high_accuracy: 86.39
+    ARC-c_accuracy: 90.51
+    BoolQ_accuracy: 88.01
+    triviaqa_wiki_1shot_score: 64.77
+    nq_open_1shot_score: 22.71
+    mmmlu_lite_naive_average: 45.02
+    IFEval_Prompt-level-strict-accuracy: 56.56
+    drop_accuracy: 75.46
+    bbh_naive_average: 73.34
+    GPQA_diamond_accuracy: 32.83
+    hellaswag_accuracy: 94.81
+    TheoremQA_score: 23.88
+    musr_average_naive_average: 51.31
+    korbench_single_naive_average: 32
+    ARC_Prize_Public_Evaluation_accuracy: 0.01
+    gsm8k_accuracy: 86.96
+    GaokaoBench_weighted_average: 78.05
+    math_accuracy: 60.34
+    cmo_fib_accuracy: 12.98
+    aime2024_accuracy: 3.33
+    Mathbench_naive_average: 64.82
+    wikibench-wiki-single_choice_cncircular_perf_4: 31.7
+    cmmlu_naive_average: 74.24
+    mmlu_naive_average: 70.2
+    mmlu_pro_naive_average: 45.39
+    openai_humaneval_humaneval_pass@1: 70.12
+    sanitized_mbpp_score: 64.59
+    humanevalx_naive_average: 38.78
+    ds1000_naive_average: 14.19
+    lcb_code_generation_pass@1: 16.5
+    lcb_code_execution_pass@1: 33.82
+    lcb_test_output_pass@1: 22.62
+    bigcodebench_hard_instruct_pass@1: 6.08
+    bigcodebench_hard_complete_pass@1: 6.76
+    teval_naive_average: 79.73
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 70.2
+    mmlu-stem_accuracy: 67.73
+    mmlu-social-science_accuracy: 75.49
+    mmlu-humanities_accuracy: 68.56
+    mmlu-other_accuracy: 70.58
+    cmmlu_accuracy: 74.24
+    cmmlu-stem_accuracy: 66.7
+    cmmlu-social-science_accuracy: 75.88
+    cmmlu-humanities_accuracy: 77.56
+    cmmlu-other_accuracy: 77.52
+    cmmlu-china-specific_accuracy: 73.46
+    mmlu_pro_accuracy: 45.39
+    mmlu_pro_biology_accuracy: 65.83
+    mmlu_pro_business_accuracy: 51.96
+    mmlu_pro_chemistry_accuracy: 36.84
+    mmlu_pro_computer_science_accuracy: 48.29
+    mmlu_pro_economics_accuracy: 56.16
+    mmlu_pro_engineering_accuracy: 29.1
+    mmlu_pro_health_accuracy: 44.5
+    mmlu_pro_history_accuracy: 42.26
+    mmlu_pro_law_accuracy: 24.98
+    mmlu_pro_math_accuracy: 54.85
+    mmlu_pro_philosophy_accuracy: 39.28
+    mmlu_pro_physics_accuracy: 37.41
+    mmlu_pro_psychology_accuracy: 58.27
+    mmlu_pro_other_accuracy: 45.78
+    humanevalx-python_pass@1: 56.1
+    humanevalx-cpp_pass@1: 20.73
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 59.15
+    humanevalx-js_pass@1: 57.93
+    ds1000_Pandas_accuracy: 8.93
+    ds1000_Numpy_accuracy: 4.09
+    ds1000_Tensorflow_accuracy: 11.11
+    ds1000_Scipy_accuracy: 7.55
+    ds1000_Sklearn_accuracy: 7.83
+    ds1000_Pytorch_accuracy: 8.82
+    ds1000_Matplotlib_accuracy: 50.97
+    mmmlu_lite_accuracy: 45.02
+    openai_mmmlu_lite_AR-XY_accuracy: 18.6
+    openai_mmmlu_lite_BN-BD_accuracy: 27.58
+    openai_mmmlu_lite_DE-DE_accuracy: 51.23
+    openai_mmmlu_lite_ES-LA_accuracy: 56.63
+    openai_mmmlu_lite_FR-FR_accuracy: 58.11
+    openai_mmmlu_lite_HI-IN_accuracy: 33.82
+    openai_mmmlu_lite_ID-ID_accuracy: 50.39
+    openai_mmmlu_lite_IT-IT_accuracy: 50.39
+    openai_mmmlu_lite_JA-JP_accuracy: 50.95
+    openai_mmmlu_lite_KO-KR_accuracy: 45.05
+    openai_mmmlu_lite_PT-BR_accuracy: 57.89
+    openai_mmmlu_lite_SW-KE_accuracy: 32.14
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 65.33
+    college_naive_average: 21
+    high_naive_average: 47
+    middle_naive_average: 59.67
+    primary_naive_average: 76
+    arithmetic_naive_average: 62
+    mathbench-a (average)_naive_average: 53.13
+    college_knowledge_naive_average: 68.99
+    high_knowledge_naive_average: 70.06
+    middle_knowledge_naive_average: 78.53
+    primary_knowledge_naive_average: 88.49
+    mathbench-t (average)_naive_average: 76.51
+
+
+qwen2.5-7b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 85.16
+    ARC-c_accuracy: 90.85
+    BoolQ_accuracy: 86.61
+    triviaqa_wiki_1shot_score: 52.96
+    nq_open_1shot_score: 17.62
+    mmmlu_lite_naive_average: 54.7
+    IFEval_Prompt-level-strict-accuracy: 71.35
+    drop_accuracy: 80.23
+    bbh_naive_average: 68.88
+    GPQA_diamond_accuracy: 36.36
+    hellaswag_accuracy: 85.49
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.3
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 91.66
+    GaokaoBench_weighted_average: 80.02
+    math_accuracy: 73.74
+    cmo_fib_accuracy: 26.44
+    aime2024_accuracy: 10
+    Mathbench_naive_average: 77.08
+    wikibench-wiki-single_choice_cncircular_perf_4: 34
+    cmmlu_naive_average: 75.9
+    mmlu_naive_average: 76.27
+    mmlu_pro_naive_average: 56.14
+    openai_humaneval_humaneval_pass@1: 84.76
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.17
+    ds1000_naive_average: 18.57
+    lcb_code_generation_pass@1: 38.75
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.45
+    bigcodebench_hard_instruct_pass@1: 16.89
+    bigcodebench_hard_complete_pass@1: 12.16
+    teval_naive_average: 79.46
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.27
+    mmlu-stem_accuracy: 77.75
+    mmlu-social-science_accuracy: 78.65
+    mmlu-humanities_accuracy: 73.12
+    mmlu-other_accuracy: 75.05
+    cmmlu_accuracy: 75.9
+    cmmlu-stem_accuracy: 73.41
+    cmmlu-social-science_accuracy: 75.97
+    cmmlu-humanities_accuracy: 76.42
+    cmmlu-other_accuracy: 78.15
+    cmmlu-china-specific_accuracy: 73.27
+    mmlu_pro_accuracy: 56.14
+    mmlu_pro_biology_accuracy: 72.25
+    mmlu_pro_business_accuracy: 66.16
+    mmlu_pro_chemistry_accuracy: 55.65
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 41.38
+    mmlu_pro_health_accuracy: 54.89
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 29.06
+    mmlu_pro_math_accuracy: 73.58
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 60.05
+    mmlu_pro_psychology_accuracy: 61.9
+    mmlu_pro_other_accuracy: 52.6
+    humanevalx-python_pass@1: 51.83
+    humanevalx-cpp_pass@1: 42.68
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 73.78
+    humanevalx-js_pass@1: 72.56
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.64
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 8.7
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 61.29
+    mmmlu_lite_accuracy: 54.7
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.18
+    openai_mmmlu_lite_DE-DE_accuracy: 60
+    openai_mmmlu_lite_ES-LA_accuracy: 66.18
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 48.63
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.26
+    openai_mmmlu_lite_JA-JP_accuracy: 60.7
+    openai_mmmlu_lite_KO-KR_accuracy: 60.63
+    openai_mmmlu_lite_PT-BR_accuracy: 54.46
+    openai_mmmlu_lite_SW-KE_accuracy: 36
+    openai_mmmlu_lite_YO-NG_accuracy: 31.86
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.4
+    college_naive_average: 48.33
+    high_naive_average: 59.33
+    middle_naive_average: 76.67
+    primary_naive_average: 86.67
+    arithmetic_naive_average: 74.33
+    mathbench-a (average)_naive_average: 69.07
+    college_knowledge_naive_average: 83.54
+    high_knowledge_naive_average: 80.82
+    middle_knowledge_naive_average: 83.79
+    primary_knowledge_naive_average: 92.22
+    mathbench-t (average)_naive_average: 85.1
+
+
+internlm3-8b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 89.22
+    ARC-c_accuracy: 92.54
+    BoolQ_accuracy: 86.45
+    triviaqa_wiki_1shot_score: 60.72
+    nq_open_1shot_score: 20.25
+    mmmlu_lite_naive_average: 41.82
+    IFEval_Prompt-level-strict-accuracy: 77.45
+    drop_accuracy: 83.27
+    bbh_naive_average: 55.22
+    GPQA_diamond_accuracy: 37.88
+    hellaswag_accuracy: 91.28
+    TheoremQA_score: 20.12
+    musr_average_naive_average: 36.86
+    korbench_single_naive_average: 41.2
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 91.28
+    GaokaoBench_weighted_average: 86.59
+    math_accuracy: 76.96
+    cmo_fib_accuracy: 35.1
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 78.96
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.45
+    cmmlu_naive_average: 83.33
+    mmlu_naive_average: 76.21
+    mmlu_pro_naive_average: 57.96
+    openai_humaneval_humaneval_pass@1: 81.71
+    sanitized_mbpp_score: 69.65
+    humanevalx_naive_average: 40.73
+    ds1000_naive_average: 27.23
+    lcb_code_generation_pass@1: 34.75
+    lcb_code_execution_pass@1: 49.9
+    lcb_test_output_pass@1: 48.19
+    bigcodebench_hard_instruct_pass@1: 13.51
+    bigcodebench_hard_complete_pass@1: 15.54
+    teval_naive_average: 82.86
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.21
+    mmlu-stem_accuracy: 77.7
+    mmlu-social-science_accuracy: 80.98
+    mmlu-humanities_accuracy: 70.83
+    mmlu-other_accuracy: 75.01
+    cmmlu_accuracy: 83.33
+    cmmlu-stem_accuracy: 79.66
+    cmmlu-social-science_accuracy: 83.39
+    cmmlu-humanities_accuracy: 84.73
+    cmmlu-other_accuracy: 86.2
+    cmmlu-china-specific_accuracy: 81.77
+    mmlu_pro_accuracy: 57.96
+    mmlu_pro_biology_accuracy: 75.45
+    mmlu_pro_business_accuracy: 64.64
+    mmlu_pro_chemistry_accuracy: 59.81
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 68.6
+    mmlu_pro_engineering_accuracy: 44.79
+    mmlu_pro_health_accuracy: 58.31
+    mmlu_pro_history_accuracy: 49.87
+    mmlu_pro_law_accuracy: 32.43
+    mmlu_pro_math_accuracy: 70.17
+    mmlu_pro_philosophy_accuracy: 46.89
+    mmlu_pro_physics_accuracy: 59.58
+    mmlu_pro_psychology_accuracy: 66.29
+    mmlu_pro_other_accuracy: 54.33
+    humanevalx-python_pass@1: 43.9
+    humanevalx-cpp_pass@1: 20.12
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 65.24
+    ds1000_Pandas_accuracy: 16.49
+    ds1000_Numpy_accuracy: 34.09
+    ds1000_Tensorflow_accuracy: 26.67
+    ds1000_Scipy_accuracy: 17.92
+    ds1000_Sklearn_accuracy: 20.87
+    ds1000_Pytorch_accuracy: 19.12
+    ds1000_Matplotlib_accuracy: 55.48
+    mmmlu_lite_accuracy: 41.82
+    openai_mmmlu_lite_AR-XY_accuracy: 32.56
+    openai_mmmlu_lite_BN-BD_accuracy: 4.56
+    openai_mmmlu_lite_DE-DE_accuracy: 24.91
+    openai_mmmlu_lite_ES-LA_accuracy: 51.09
+    openai_mmmlu_lite_FR-FR_accuracy: 61.68
+    openai_mmmlu_lite_HI-IN_accuracy: 24.98
+    openai_mmmlu_lite_ID-ID_accuracy: 44.56
+    openai_mmmlu_lite_IT-IT_accuracy: 52.35
+    openai_mmmlu_lite_JA-JP_accuracy: 51.02
+    openai_mmmlu_lite_KO-KR_accuracy: 47.93
+    openai_mmmlu_lite_PT-BR_accuracy: 53.89
+    openai_mmmlu_lite_SW-KE_accuracy: 33.47
+    openai_mmmlu_lite_YO-NG_accuracy: 33.47
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.05
+    college_naive_average: 45.67
+    high_naive_average: 64.67
+    middle_naive_average: 82.33
+    primary_naive_average: 90.33
+    arithmetic_naive_average: 74
+    mathbench-a (average)_naive_average: 71.4
+    college_knowledge_naive_average: 85.28
+    high_knowledge_naive_average: 79.43
+    middle_knowledge_naive_average: 87.9
+    primary_knowledge_naive_average: 93.42
+    mathbench-t (average)_naive_average: 86.51
+
+
+internlm3-8b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 89.02
+    ARC-c_accuracy: 93.56
+    BoolQ_accuracy: 86.67
+    triviaqa_wiki_1shot_score: 60.54
+    nq_open_1shot_score: 20.3
+    mmmlu_lite_naive_average: 42.6
+    IFEval_Prompt-level-strict-accuracy: 79.11
+    drop_accuracy: 83.32
+    bbh_naive_average: 54.76
+    GPQA_diamond_accuracy: 42.42
+    hellaswag_accuracy: 91.31
+    TheoremQA_score: 18
+    musr_average_naive_average: 36.62
+    korbench_single_naive_average: 41.84
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 90.67
+    GaokaoBench_weighted_average: 86.27
+    math_accuracy: 76.68
+    cmo_fib_accuracy: 33.65
+    aime2024_accuracy: 10
+    Mathbench_naive_average: 78.92
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.35
+    cmmlu_naive_average: 83.11
+    mmlu_naive_average: 76.23
+    mmlu_pro_naive_average: 58.16
+    openai_humaneval_humaneval_pass@1: 82.32
+    sanitized_mbpp_score: 70.04
+    humanevalx_naive_average: 39.76
+    ds1000_naive_average: 27.84
+    lcb_code_generation_pass@1: 34.5
+    lcb_code_execution_pass@1: 48.02
+    lcb_test_output_pass@1: 47.74
+    bigcodebench_hard_instruct_pass@1: 12.84
+    bigcodebench_hard_complete_pass@1: 15.54
+    teval_naive_average: 82.86
+    SciCode_sub_accuracy: 100
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.23
+    mmlu-stem_accuracy: 78.08
+    mmlu-social-science_accuracy: 80.31
+    mmlu-humanities_accuracy: 71.38
+    mmlu-other_accuracy: 74.63
+    cmmlu_accuracy: 83.11
+    cmmlu-stem_accuracy: 79.42
+    cmmlu-social-science_accuracy: 83.34
+    cmmlu-humanities_accuracy: 83.95
+    cmmlu-other_accuracy: 86.22
+    cmmlu-china-specific_accuracy: 81.5
+    mmlu_pro_accuracy: 58.16
+    mmlu_pro_biology_accuracy: 74.62
+    mmlu_pro_business_accuracy: 65.02
+    mmlu_pro_chemistry_accuracy: 60.69
+    mmlu_pro_computer_science_accuracy: 61.46
+    mmlu_pro_economics_accuracy: 68.25
+    mmlu_pro_engineering_accuracy: 45.3
+    mmlu_pro_health_accuracy: 60.15
+    mmlu_pro_history_accuracy: 50.66
+    mmlu_pro_law_accuracy: 31.7
+    mmlu_pro_math_accuracy: 70.32
+    mmlu_pro_philosophy_accuracy: 47.7
+    mmlu_pro_physics_accuracy: 59.51
+    mmlu_pro_psychology_accuracy: 65.41
+    mmlu_pro_other_accuracy: 53.46
+    humanevalx-python_pass@1: 42.68
+    humanevalx-cpp_pass@1: 19.51
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 72.56
+    humanevalx-js_pass@1: 64.02
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 35
+    ds1000_Tensorflow_accuracy: 24.44
+    ds1000_Scipy_accuracy: 20.75
+    ds1000_Sklearn_accuracy: 21.74
+    ds1000_Pytorch_accuracy: 22.06
+    ds1000_Matplotlib_accuracy: 56.77
+    mmmlu_lite_accuracy: 42.6
+    openai_mmmlu_lite_AR-XY_accuracy: 32.84
+    openai_mmmlu_lite_BN-BD_accuracy: 10.46
+    openai_mmmlu_lite_DE-DE_accuracy: 24.56
+    openai_mmmlu_lite_ES-LA_accuracy: 50.95
+    openai_mmmlu_lite_FR-FR_accuracy: 61.05
+    openai_mmmlu_lite_HI-IN_accuracy: 30.6
+    openai_mmmlu_lite_ID-ID_accuracy: 45.89
+    openai_mmmlu_lite_IT-IT_accuracy: 51.79
+    openai_mmmlu_lite_JA-JP_accuracy: 51.65
+    openai_mmmlu_lite_KO-KR_accuracy: 48.77
+    openai_mmmlu_lite_PT-BR_accuracy: 52.7
+    openai_mmmlu_lite_SW-KE_accuracy: 32.91
+    openai_mmmlu_lite_YO-NG_accuracy: 32.84
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.33
+    college_naive_average: 47
+    high_naive_average: 66.67
+    middle_naive_average: 81.67
+    primary_naive_average: 89.33
+    arithmetic_naive_average: 73.67
+    mathbench-a (average)_naive_average: 71.67
+    college_knowledge_naive_average: 82.91
+    high_knowledge_naive_average: 79.86
+    middle_knowledge_naive_average: 88.92
+    primary_knowledge_naive_average: 92.96
+    mathbench-t (average)_naive_average: 86.16
.github/scripts/oc_score_baseline_testrange.yaml (vendored, 236 lines changed)
@@ -1,21 +1,24 @@
 chat:
   glm-4-9b-chat-hf:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 90.62
+    gsm8k_accuracy: 56.25
+    race-high_accuracy: 84.38
   glm-4-9b-chat-turbomind:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 90.62
   glm-4-9b-chat-vllm:
-    gsm8k_accuracy: 71.88
+    gsm8k_accuracy: 68.75
     race-high_accuracy: 90.62
   deepseek-7b-chat-hf:
     gsm8k_accuracy: 46.88
     race-high_accuracy: 81.25
-  deepseek-moe-16b-chat-hf:
-    gsm8k_accuracy: 50
-    race-high_accuracy: 68.75
+  deepseek-r1-distill-llama-8b-turbomind:
+    gsm8k_accuracy: 31.25
+    race-high_accuracy: 81.25
+  deepseek-r1-distill-qwen-1_5b-turbomind:
+    gsm8k_accuracy: 37.5
+    race-high_accuracy: 53.12
   deepseek-7b-chat-vllm:
-    gsm8k_accuracy: 50
+    gsm8k_accuracy: 43.75
     race-high_accuracy: 78.12
   gemma2-2b-it-hf:
     gsm8k_accuracy: 50
@@ -36,34 +39,40 @@ chat:
     gsm8k_accuracy: 78.12
     race-high_accuracy: 93.75
   gemma-7b-it-vllm:
-    gsm8k_accuracy: 46.88
+    gsm8k_accuracy: 31.25
     race-high_accuracy: 68.75
   internlm2_5-7b-chat-hf:
     gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
+  internlm3-8b-instruct-hf:
+    gsm8k_accuracy: 65.62
+    race-high_accuracy: 87.5
   internlm2_5-7b-chat-turbomind:
-    gsm8k_accuracy: 87.50
+    gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
   internlm2-chat-1.8b-turbomind:
     gsm8k_accuracy: 28.12
     race-high_accuracy: 84.38
   internlm2-chat-1.8b-sft-turbomind:
-    gsm8k_accuracy: 21.88
+    gsm8k_accuracy: 31.25
     race-high_accuracy: 84.38
   internlm2-chat-7b-lmdeploy:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 59.38
     race-high_accuracy: 84.38
   internlm2-chat-7b-sft-turbomind:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 90.62
+  internlm3-8b-instruct-turbomind:
+    gsm8k_accuracy: 68.75
+    race-high_accuracy: 87.5
   internlm2-chat-7b-vllm:
-    gsm8k_accuracy: 43.75
-    race-high_accuracy: 84.38
+    gsm8k_accuracy: 59.38
+    race-high_accuracy: 87.50
   llama-3_1-8b-instruct-hf:
     gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-hf:
-    gsm8k_accuracy: 68.75
+    gsm8k_accuracy: 71.88
     race-high_accuracy: 81.25
   llama-3-8b-instruct-hf:
     gsm8k_accuracy: 68.75
@@ -72,14 +81,14 @@ chat:
     gsm8k_accuracy: 18.75
     race-high_accuracy: 46.88
   llama-3_1-8b-instruct-turbomind:
-    gsm8k_accuracy: 78.12
+    gsm8k_accuracy: 81.25
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-turbomind:
-    gsm8k_accuracy: 65.62
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 81.25
   llama-3-8b-instruct-turbomind:
-    gsm8k_accuracy: 71.88
-    race-high_accuracy: 87.5
+    gsm8k_accuracy: 68.75
+    race-high_accuracy: 84.38
   mistral-7b-instruct-v0.2-hf:
     gsm8k_accuracy: 40.62
     race-high_accuracy: 75
@@ -94,13 +103,10 @@ chat:
     race-high_accuracy: 78.12
   mistral-7b-instruct-v0.1-vllm:
     gsm8k_accuracy: 34.38
-    race-high_accuracy: 68.75
-  mistral-7b-instruct-v0.2-vllm:
-    gsm8k_accuracy: 31.25
-    race-high_accuracy: 75
+    race-high_accuracy: 65.62
   phi-3-mini-4k-instruct-hf:
-    gsm8k_accuracy: 81.25
-    race-high_accuracy: 87.50
+    gsm8k_accuracy: 21.88
+    race-high_accuracy: 78.12
   qwen2.5-0.5b-instruct-hf:
     gsm8k_accuracy: 34.38
     race-high_accuracy: 46.88
@@ -108,10 +114,10 @@ chat:
     gsm8k_accuracy: 53.12
     race-high_accuracy: 90.62
   qwen2.5-0.5b-instruct-turbomind:
-    gsm8k_accuracy: 28.12
-    race-high_accuracy: 50
+    gsm8k_accuracy: 31.25
+    race-high_accuracy: 43.75
   qwen2.5-3b-instruct-turbomind:
-    gsm8k_accuracy: 59.38
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 90.62
   qwen1.5-0.5b-chat-hf:
     gsm8k_accuracy: 0
@@ -123,11 +129,11 @@ chat:
     gsm8k_accuracy: 68.75
     race-high_accuracy: 90.62
   qwen2-1.5b-instruct-turbomind:
-    gsm8k_accuracy: 53.12
+    gsm8k_accuracy: 56.25
     race-high_accuracy: 84.38
   qwen2-7b-instruct-turbomind:
     gsm8k_accuracy: 81.25
-    race-high_accuracy: 90.62
+    race-high_accuracy: 87.50
   qwen1.5-0.5b-chat-vllm:
     gsm8k_accuracy: 3.12
     race-high_accuracy: 53.12
@@ -143,11 +149,11 @@ chat:
   yi-1.5-9b-chat-turbomind:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 93.75
-  deepseek-v2-lite-chat-hf:
-    gsm8k_accuracy: 46.88
+  deepseek-v2_lite-chat-turbomind:
+    gsm8k_accuracy: 37.5
     race-high_accuracy: 71.88
   gemma2-27b-it-hf:
-    gsm8k_accuracy: 75
+    gsm8k_accuracy: 71.88
     race-high_accuracy: 93.75
   internlm2_5-20b-chat-hf:
     gsm8k_accuracy: 84.38
@@ -161,6 +167,9 @@ chat:
   mistral-small-instruct-2409-turbomind:
     gsm8k_accuracy: 81.25
     race-high_accuracy: 87.50
+  phi-4:
+    gsm8k_accuracy: 81.25
+    race-high_accuracy: 87.50
   qwen2.5-14b-instruct-hf:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 96.88
@@ -168,40 +177,41 @@ chat:
     gsm8k_accuracy: 68.75
     race-high_accuracy: 93.75
   yi-1.5-34b-chat-turbomind:
-    gsm8k_accuracy: 78.12
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 93.75
-  deepseek-67b-chat-hf:
-    gsm8k_accuracy: 71.88
+  deepseek-67b-chat-turbomind:
+    gsm8k_accuracy: 75.00
     race-high_accuracy: 78.12
+  deepseek-r1-distill-qwen-32b-turbomind:
+    gsm8k_accuracy: 25
+    race-high_accuracy: 90.62
+  llama-3_3-70b-instruct-turbomind:
+    gsm8k_accuracy: 93.75
+    race-high_accuracy: 87.5
   mixtral-8x7b-instruct-v0.1-hf:
     gsm8k_accuracy: 59.38
     race-high_accuracy: 81.25
   mixtral-large-instruct-2411-turbomind:
-    gsm8k_accuracy: 90.62
+    gsm8k_accuracy: 87.50
     race-high_accuracy: 93.75
   nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
-    gsm8k_accuracy: 87.5
-    race-high_accuracy: 46.88
+    gsm8k_accuracy: 93.75
+    race-high_accuracy: 50.00
   qwen2.5-72b-instruct-turbomind:
-    gsm8k_accuracy: 75
-    race-high_accuracy: 93.75
+    gsm8k_accuracy: 81.25
+    race-high_accuracy: 90.62
+  deepseek-r1-distill-llama-70b-turbomind:
+    gsm8k_accuracy: 40.62
+    race-high_accuracy: 90.62
+  deepseek-v2_5-1210-turbomind:
+    gsm8k_accuracy: 90.62
+    race-high_accuracy: 84.38
   mixtral-8x22b-instruct-v0.1-hf:
     gsm8k_accuracy: 81.25
     race-high_accuracy: 81.25
   mixtral-8x22b-instruct-v0.1-turbomind:
     gsm8k_accuracy: 75
     race-high_accuracy: 78.12
   mixtral-8x22b-instruct-v0.1-vllm:
     gsm8k_accuracy: 78.12
     race-high_accuracy: 78.12
 base:
   glm-4-9b-hf:
     gsm8k_accuracy: 68.75
     GPQA_diamond_accuracy: 31.25
     race-high_accuracy: 93.75
     winogrande_accuracy: 84.38
   glm-4-9b-turbomind:
-    gsm8k_accuracy: 62.5
+    gsm8k_accuracy: 56.25
     GPQA_diamond_accuracy: 28.12
     race-high_accuracy: 93.75
     winogrande_accuracy: 84.38
@@ -210,15 +220,10 @@ base:
     GPQA_diamond_accuracy: 0
     race-high_accuracy: 46.88
     winogrande_accuracy: 71.88
-  deepseek-moe-16b-base-hf:
-    gsm8k_accuracy: 21.88
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 21.88
-    winogrande_accuracy: 65.62
   deepseek-7b-base-turbomind:
-    gsm8k_accuracy: 21.88
+    gsm8k_accuracy: 18.75
     GPQA_diamond_accuracy: 0
-    race-high_accuracy: 46.88
+    race-high_accuracy: 43.75
     winogrande_accuracy: 84.38
   deepseek-moe-16b-base-vllm:
     gsm8k_accuracy: 21.88
@@ -245,16 +250,21 @@ base:
     GPQA_diamond_accuracy: 3.12
     race-high_accuracy: 65.62
     winogrande_accuracy: 71.88
+  gemma-2-9b-turbomind:
+    gsm8k_accuracy: 68.75
+    GPQA_diamond_accuracy: 0
+    race-high_accuracy: 78.12
+    winogrande_accuracy: 50
   gemma-2b-vllm:
     gsm8k_accuracy: 15.62
     GPQA_diamond_accuracy: 3.12
-    race-high_accuracy:
-    winogrande_accuracy:
+    race-high_accuracy: 28.12
+    winogrande_accuracy: 68.75
   gemma-7b-vllm:
-    gsm8k_accuracy: 53.12
-    GPQA_diamond_accuracy: 9.38
-    race-high_accuracy:
-    winogrande_accuracy:
+    gsm8k_accuracy: 43.75
+    GPQA_diamond_accuracy: 6.25
+    race-high_accuracy: 81.25
+    winogrande_accuracy: 81.25
   internlm2_5-7b-hf:
     gsm8k_accuracy: 37.5
     GPQA_diamond_accuracy: 25
@@ -265,30 +275,25 @@ base:
     GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 62.5
     winogrande_accuracy: 78.12
-  internlm2-base-7b-hf:
-    gsm8k_accuracy: 3.12
-    GPQA_diamond_accuracy: 21.88
-    race-high_accuracy: 75
-    winogrande_accuracy: 65.62
   internlm2-1.8b-turbomind:
-    gsm8k_accuracy: 12.5
-    GPQA_diamond_accuracy: 9.38
+    gsm8k_accuracy: 6.25
+    GPQA_diamond_accuracy: 12.5
     race-high_accuracy: 71.88
-    winogrande_accuracy: 78.12
+    winogrande_accuracy: 75
   internlm2_5-7b-turbomind:
-    gsm8k_accuracy: 62.50
+    gsm8k_accuracy: 59.38
     GPQA_diamond_accuracy: 34.38
     race-high_accuracy: 93.75
-    winogrande_accuracy: 87.50
+    winogrande_accuracy: 84.38
   internlm2-7b-turbomind:
-    gsm8k_accuracy: 53.12
-    GPQA_diamond_accuracy: 21.88
+    gsm8k_accuracy: 50
+    GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 71.88
     winogrande_accuracy: 84.38
   internlm2-base-7b-turbomind:
     gsm8k_accuracy: 37.50
-    GPQA_diamond_accuracy: 28.12
-    race-high_accuracy: 81.25
+    GPQA_diamond_accuracy: 21.88
+    race-high_accuracy: 84.38
     winogrande_accuracy: 75
   llama-2-7b-hf:
     gsm8k_accuracy: 21.88
@@ -311,7 +316,7 @@ base:
     race-high_accuracy: 78.12
     winogrande_accuracy: 78.12
   llama-3-8b-turbomind:
-    gsm8k_accuracy: 50
+    gsm8k_accuracy: 46.88
     GPQA_diamond_accuracy: 12.50
     race-high_accuracy: 65.62
     winogrande_accuracy: 78.12
@@ -327,14 +332,14 @@ base:
     winogrande_accuracy: 71.88
   qwen2.5-1.5b-turbomind:
     gsm8k_accuracy: 62.50
-    GPQA_diamond_accuracy: 12.50
-    race-high_accuracy: 78.12
-    winogrande_accuracy: 68.75
-  qwen2.5-7b-turbomind:
-    gsm8k_accuracy: 75.00
-    GPQA_diamond_accuracy: 25
-    race-high_accuracy: 87.5
+    GPQA_diamond_accuracy: 15.62
+    race-high_accuracy: 75
+    winogrande_accuracy: 71.88
+  qwen2.5-7b-turbomind:
+    gsm8k_accuracy: 71.88
+    GPQA_diamond_accuracy: 18.75
+    race-high_accuracy: 87.5
+    winogrande_accuracy: 75.00
   qwen1.5-moe-a2.7b-hf:
     gsm8k_accuracy: 62.5
     GPQA_diamond_accuracy: 18.75
@@ -356,17 +361,17 @@ base:
     race-high_accuracy: 87.5
     winogrande_accuracy: 68.75
   qwen2-1.5b-turbomind:
-    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 9.38
+    gsm8k_accuracy: 59.38
+    GPQA_diamond_accuracy: 12.50
     race-high_accuracy: 81.25
     winogrande_accuracy: 75
   qwen2-7b-turbomind:
-    gsm8k_accuracy: 75.00
+    gsm8k_accuracy: 65.62
     GPQA_diamond_accuracy: 12.5
     race-high_accuracy: 87.5
     winogrande_accuracy: 71.88
   qwen1.5-0.5b-vllm:
-    gsm8k_accuracy: 9.38
+    gsm8k_accuracy: 6.25
     GPQA_diamond_accuracy: 0
     race-high_accuracy: 56.25
     winogrande_accuracy: 62.5
@@ -382,27 +387,12 @@ base:
     winogrande_accuracy: 59.38
   yi-1.5-9b-turbomind:
     gsm8k_accuracy: 78.12
-    GPQA_diamond_accuracy: 40.62
+    GPQA_diamond_accuracy: 43.75
     race-high_accuracy: 87.5
     winogrande_accuracy: 71.88
-  deepseek-v2-lite-hf:
-    gsm8k_accuracy: 31.25
-    GPQA_diamond_accuracy: 28.12
-    race-high_accuracy: 59.38
-    winogrande_accuracy: 71.88
-  internlm2-20b-hf:
-    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 68.75
-    winogrande_accuracy: 75
-  internlm2-base-20b-hf:
-    gsm8k_accuracy: 12.5
-    GPQA_diamond_accuracy: 9.38
-    race-high_accuracy: 84.38
-    winogrande_accuracy: 65.62
   internlm2-20b-turbomind:
-    gsm8k_accuracy: 71.88
-    GPQA_diamond_accuracy: 15.62
+    gsm8k_accuracy: 75
+    GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 68.75
     winogrande_accuracy: 81.25
   qwen2.5-14b-hf:
@@ -416,37 +406,27 @@ base:
     race-high_accuracy: 93.75
     winogrande_accuracy: 78.12
   qwen2.5-32b-turbomind:
-    gsm8k_accuracy: 84.38
-    GPQA_diamond_accuracy: 28.12
+    gsm8k_accuracy: 87.5
+    GPQA_diamond_accuracy: 18.75
     race-high_accuracy: 93.75
     winogrande_accuracy: 81.25
-  deepseek-67b-base-hf:
-    gsm8k_accuracy: 59.38
-    GPQA_diamond_accuracy: 31.25
-    race-high_accuracy: 81.25
-    winogrande_accuracy: 90.62
   deepseek-67b-base-turbomind:
-    gsm8k_accuracy: 56.25
+    gsm8k_accuracy: 53.12
     GPQA_diamond_accuracy: 28.12
     race-high_accuracy: 81.25
     winogrande_accuracy: 84.38
   llama-3-70b-turbomind:
-    gsm8k_accuracy: 59.38
-    GPQA_diamond_accuracy: 9.38
+    gsm8k_accuracy: 56.25
+    GPQA_diamond_accuracy: 12.50
     race-high_accuracy: 93.75
     winogrande_accuracy: 84.38
   qwen2.5-72b-turbomind:
     gsm8k_accuracy: 84.38
-    GPQA_diamond_accuracy: 34.38
+    GPQA_diamond_accuracy: 31.25
     race-high_accuracy: 93.75
     winogrande_accuracy: 87.5
-  deepseek-v2-turbomind:
-    gsm8k_accuracy: 65.62
-    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
   llama-3-70b-hf:
-    gsm8k_accuracy: 62.5
+    gsm8k_accuracy: 59.38
     GPQA_diamond_accuracy: 3.12
     race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
+    winogrande_accuracy: 81.25
.github/workflows/daily-run-test.yml (vendored, 21 lines changed)
@ -61,6 +61,7 @@ env:
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
CONDA_ENV: regression_test
VLLM_WORKER_MULTIPROC_METHOD: spawn

jobs:
build-pypi:
@ -92,7 +93,6 @@ jobs:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
environment: 'prod'
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
@ -126,7 +126,6 @@ jobs:
if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy']
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 120 #2hours
steps:
- name: Clone repository
@ -190,7 +189,6 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 180 #3hours
steps:
- name: Clone repository
@ -231,7 +229,6 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
@ -258,27 +255,33 @@ jobs:
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 180s
env | grep PROXY
env | grep proxy
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
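After launching the server, this step only sleeps a fixed 180 s before running the evaluation; a quick readiness probe against the freshly started endpoint would surface a failed launch sooner. A minimal sketch using the OpenAI Python SDK (client usage only, assuming lmdeploy's default api_server port; not part of this PR):

# Smoke test for the lmdeploy OpenAI-compatible server started above.
# Assumes the default api_server port 23333 and the model name passed
# via --model-name.
from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='EMPTY')
resp = client.chat.completions.create(
    model='internlm3',
    messages=[{'role': 'user', 'content': 'ping'}],
    max_tokens=8,
)
print(resp.choices[0].message.content)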
@ -307,7 +310,6 @@ jobs:
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
@ -341,7 +343,6 @@ jobs:
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
timeout-minutes: 5
runs-on: self-hosted
environment: 'prod'
steps:
- name: notify
run: |

@ -0,0 +1,22 @@
from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='mixtral-8x22b-instruct-v0.1-turbomind',
        path='mistralai/Mixtral-8x22B-Instruct-v0.1',
        engine_config=dict(
            session_len=32768,
            max_batch_size=16,
            tp=8,
            cache_max_entry_count=0.7,
        ),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=8),
    )
]
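To schedule this new config in an eval run, its models list would be imported through mmengine's read_base mechanism, as OpenCompass configs usually are. A sketch, with the module path assumed rather than taken from the diff:

# Sketch: consuming the new Mixtral config from an eval entry script.
# The import path below is an assumption; adjust it to wherever the
# config file actually lands under opencompass/configs/models/.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
        models as lmdeploy_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501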
@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer):
            f.write(','.join(new_header) + '\n')
            for line in new_table:
                f.write(','.join(map(str, line)) + '\n')
        print(t)
        print(output_file)
        return {'qa_bench_' + show_dataset_abbr: json_result}
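One caveat in this hunk: joining fields with bare commas corrupts the CSV whenever a header or value itself contains a comma or quote. The stdlib csv module quotes such fields automatically; a sketch of the equivalent write (not a change made by this PR):

# Sketch: same rows as the comma-join above, with proper CSV quoting.
import csv


def write_summary_csv(output_file, new_header, new_table):
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(new_header)   # header row
        writer.writerows(new_table)   # one row per result line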