[ci] move testcases to volc engine (#1777)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update
zhulinJulia24 2024-12-25 17:26:50 +08:00 committed by GitHub
parent ebefffed61
commit c48bbde26f
11 changed files with 1226 additions and 1422 deletions

View File

@@ -66,6 +66,8 @@ with read_base():
     from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501
 race_datasets = [race_datasets[1]]  # Only take RACE-High
 humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
 bbh_datasets = [

View File

@@ -13,12 +13,22 @@ with read_base():
     # read hf models - chat models
     from opencompass.configs.models.chatglm.hf_glm4_9b import \
         models as hf_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
+        models as lmdeploy_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
+        models as hf_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
+        models as hf_deepseek_67b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
         models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
         models as hf_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
         models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
+        models as lmdeploy_deepseek_67b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import \
+        lmdeploy_deepseek_v2_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
         models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b import \
@@ -29,6 +39,8 @@ with read_base():
         models as hf_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b import \
         models as hf_gemma_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_9b import \
+        models as lmdeploy_gemma_9b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_2b import \
         models as vllm_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b import \
@@ -59,10 +71,14 @@ with read_base():
         models as hf_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b import \
         models as hf_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.hf_llama3_70b import \
+        models as hf_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
         models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
         models as lmdeploy_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
+        models as lmdeploy_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
         models as hf_mistral_7b_v0_2_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
@@ -73,10 +89,16 @@ with read_base():
         models as hf_qwen_2_5_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
         models as hf_qwen_2_5_14b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_32b import \
+        models as hf_qwen_2_5_32b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
         models as lmdeploy_qwen2_5_1_5b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
         models as lmdeploy_qwen2_5_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
+        models as lmdeploy_qwen2_5_32b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \
+        models as lmdeploy_qwen2_5_72b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
         models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@@ -95,6 +117,10 @@ with read_base():
         models as hf_yi_1_5_6b_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b import \
         models as hf_yi_1_5_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
+        models as lmdeploy_yi_1_5_9b_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501
 race_datasets = [race_datasets[1]]
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
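For context: both configs above end with the same aggregation idiom. Under `read_base()`, every import binds a `*_model` (or `*_datasets`) list into the config's namespace, and the final `sum(...)` line flattens all of them into one list. A minimal self-contained sketch of the pattern; the two `*_model` variables here are made-up placeholders, not real OpenCompass configs:

# Stand-ins for the lists that the read_base() imports would bind here.
hf_demo_7b_model = [dict(abbr='demo-7b-hf', path='demo/7b')]
lmdeploy_demo_7b_model = [dict(abbr='demo-7b-turbomind', path='demo/7b')]

# Collect every variable whose name ends in '_model' and concatenate the
# per-import lists into a single flat `models` list, as the configs do.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
print([m['abbr'] for m in models])  # -> ['demo-7b-hf', 'demo-7b-turbomind']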

View File

@@ -7,8 +7,6 @@ with read_base():
     from opencompass.configs.datasets.race.race_gen import \
         race_datasets  # noqa: F401, E501
     # read hf models - chat models
-    from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
-        models as hf_baichuan2_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
         models as hf_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
@@ -17,22 +15,30 @@ with read_base():
         models as vllm_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
         models as hf_deepseek_7b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
+        models as hf_deepseek_67b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
         models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
         models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
+        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
         models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
         models as hf_gemma2_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
         models as hf_gemma2_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.hf_gemma2_27b_it import \
+        models as hf_gemma2_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_2b_it import \
         models as hf_gemma_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b_it import \
         models as hf_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
         models as lmdeploy_gemma_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
+        models as lmdeploy_gemma_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
         models as vllm_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@@ -65,6 +71,8 @@ with read_base():
         models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
         models as lmdeploy_llama3_2_3b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \
+        models as lmdeploy_llama3_3_70b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
         models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
@@ -75,6 +83,13 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
+        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
+        models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
+        models as \
+        lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
         models as lmdeploy_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
@@ -84,22 +99,28 @@ with read_base():
         models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
         models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
+        models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
     from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
         models as hf_minicpm3_4b_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
-        models as hf_minicpm_2b_dpo_fp32_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
-        models as hf_minicpm_2b_sft_bf16_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
-        models as hf_minicpm_2b_sft_fp32_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
         models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
         models as hf_phi_3_mini_8k_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
+        models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
+        models as hf_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
         models as hf_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \
+        models as lmdeploy_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \
+        models as lmdeploy_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
         models as lmdeploy_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
+        models as lmdeploy_qwen2_5_72b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
         models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@@ -116,6 +137,14 @@ with read_base():
         models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
         models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \
+        models as lmdeploy_yi_1_5_6b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
+        models as lmdeploy_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
+        models as lmdeploy_yi_1_5_34b_chat_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501
 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

View File

@@ -107,6 +107,8 @@ with read_base():
     from opencompass.configs.summarizers.mmmlu_lite import \
         mmmlu_summary_groups  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501
 # For HumanEval-X Evaluation
 # Apply the evaluator ip_address and port
 race_datasets = [race_datasets[1]]

View File

@@ -22,8 +22,7 @@ with read_base():
         arenahard_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
         compassarena_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import \
-        fofo_datasets  # noqa: F401, E501
+    # from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
         followbench_llmeval_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
@@ -35,6 +34,8 @@ with read_base():
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
         models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
+    from ...volc import infer as volc_infer  # noqa: F401, E501
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
                 and 'mtbench101' not in k and 'wildbench' not in k), [])
 datasets += mtbench101_datasets  # noqa: F401, E501
@@ -73,25 +74,15 @@ eval = dict(
 summary_groups = []
 summary_groups.append({
-    'name':
-    'compassarena_language',
+    'name': 'compassarena_language',
     'subsets': [
         ['compassarena_language', '内容总结'],
-        ['compassarena_language', '情感分析'],
-        ['compassarena_language', 'Information Retrival'],
-        ['compassarena_language', '综合问答'],
-        ['compassarena_language', '中华文化'],
     ],
 })
 summary_groups.append({
-    'name':
-    'compassarena_knowledge',
+    'name': 'compassarena_knowledge',
     'subsets': [
         ['compassarena_knowledge', '生活常识_ZH'],
-        ['compassarena_knowledge', '自然科学工科_ZH'],
-        ['compassarena_knowledge', '人文科学_ZH'],
-        ['compassarena_knowledge', '自然科学理科_ZH'],
-        ['compassarena_knowledge', '社会科学_ZH'],
     ],
 })
 summary_groups.append({
@@ -101,21 +92,15 @@ summary_groups.append({
     ],
 })
 summary_groups.append({
-    'name':
-    'compassarena_math_v2',
+    'name': 'compassarena_math_v2',
     'subsets': [
         ['compassarena_math_v2', '高等数学_ZH'],
-        ['compassarena_math_v2', '初等数学_ZH'],
-        ['compassarena_math_v2', '中等数学_ZH'],
     ],
 })
 summary_groups.append({
-    'name':
-    'compassarena_creationv2_zh',
+    'name': 'compassarena_creationv2_zh',
     'subsets': [
         ['compassarena_creationv2_zh', '内容扩写_ZH'],
-        ['compassarena_creationv2_zh', '内容续写_ZH'],
-        ['compassarena_creationv2_zh', '内容改写_ZH'],
     ],
 })
 summary_groups.append({
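Each `summary_groups` entry above names a group and the `(dataset, subset)` pairs it aggregates; trimming the subset lists shrinks what feeds the group scores reported as `<name>_naive_average` in the baselines. A rough sketch of the averaging such a group implies, assuming a plain mean over subset scores (the real OpenCompass summarizer handles this internally; the score value below is made up):

# Hypothetical illustration of how a summary group turns per-subset scores
# into one '<group>_naive_average' number.
group = {
    'name': 'compassarena_language',
    'subsets': [['compassarena_language', '内容总结']],
}
scores = {('compassarena_language', '内容总结'): 62.0}  # made-up score
subset_scores = [scores[tuple(s)] for s in group['subsets']]
naive_average = sum(subset_scores) / len(subset_scores)
print(f"{group['name']}_naive_average = {naive_average}")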

View File

@@ -6,74 +6,19 @@ import yaml

 output_path = 'regression_result_daily'

-chat_model_list = [
-    'baichuan2-7b-chat-hf',
-    'glm-4-9b-chat-hf',
-    'glm-4-9b-chat-turbomind',
-    'glm-4-9b-chat-vllm',
-    'deepseek-7b-chat-hf',
-    'deepseek-moe-16b-chat-hf',
-    'deepseek-7b-chat-vllm',
-    'gemma2-2b-it-hf',
-    'gemma2-9b-it-hf',
-    'gemma-2b-it-hf',
-    'gemma-7b-it-hf',
-    'gemma-2-9b-it-turbomind',
-    'gemma-7b-it-vllm',
-    'internlm2_5-7b-chat-hf',
-    'internlm2_5-7b-chat-turbomind',
-    'internlm2-chat-1.8b-turbomind',
-    'internlm2-chat-1.8b-sft-turbomind',
-    'internlm2-chat-7b-lmdeploy',
-    'internlm2-chat-7b-sft-turbomind',
-    'internlm2-chat-7b-vllm',
-    'llama-3_1-8b-instruct-hf',
-    'llama-3_2-3b-instruct-hf',
-    'llama-3-8b-instruct-hf',
-    'llama-3_1-8b-instruct-turbomind',
-    'llama-3_2-3b-instruct-turbomind',
-    'llama-3-8b-instruct-turbomind',
-    'mistral-7b-instruct-v0.2-hf',
-    'mistral-7b-instruct-v0.3-hf',
-    'mistral-nemo-instruct-2407-hf',
-    'mistral-nemo-instruct-2407-turbomind',
-    'mistral-7b-instruct-v0.1-vllm',
-    'mistral-7b-instruct-v0.2-vllm',
-    # 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
-    # 'minicpm-2b-sft-fp32-hf',
-    'phi-3-mini-4k-instruct-hf',
-    'qwen1.5-0.5b-chat-hf',
-    'qwen2-1.5b-instruct-hf',
-    'qwen2-7b-instruct-hf',
-    'qwen2-1.5b-instruct-turbomind',
-    'qwen2-7b-instruct-turbomind',
-    'qwen1.5-0.5b-chat-vllm',
-    'yi-1.5-6b-chat-hf',
-    'yi-1.5-9b-chat-hf',
-    'deepseek-v2-lite-chat-hf',
-    'internlm2_5-20b-chat-hf',
-    'internlm2_5-20b-chat-turbomind',
-    'mistral-small-instruct-2409-hf',
-    'mistral-small-instruct-2409-turbomind',
-    'qwen2.5-14b-instruct-hf',
-    'qwen2.5-14b-instruct-turbomind'
-]
-base_model_list = [
-    'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
-    'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
-    'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
-    'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
-    'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
-    'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
-    'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
-    'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
-    'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
-    'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
-    'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
-    'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
-    'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
-    'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
-]
+
+def model_list(type):
+    config_path = '.github/scripts/oc_score_baseline_testrange.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(type).keys()
+
+
+def dataset_list(model, type):
+    config_path = '.github/scripts/oc_score_baseline_fullbench.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(model).get(type).keys()
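With this refactor the test matrix is driven by the baseline YAML itself: `model_list('chat')` returns whatever model names sit under the `chat` key of `oc_score_baseline_testrange.yaml`, so adding a model to the baseline automatically parametrizes the tests. A small sketch of the nested layout this implies, using an inline snippet instead of the real file and illustrative scores:

import yaml

# Illustrative stand-in for oc_score_baseline_testrange.yaml: a top-level
# 'chat'/'base' key, then one score mapping per model.
snippet = """
chat:
  glm-4-9b-chat-hf:
    gsm8k_accuracy: 75.0
    race-high_accuracy: 90.0
base:
  glm-4-9b-hf:
    gsm8k_accuracy: 60.0
"""
config = yaml.load(snippet, Loader=yaml.SafeLoader)
print(list(config.get('chat').keys()))  # -> ['glm-4-9b-chat-hf']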
 @pytest.fixture()
@@ -115,36 +60,39 @@ def result_scores():

 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.chat
+@pytest.mark.chat_models
 class TestChat:
     """Test cases for chat model."""

     @pytest.mark.parametrize(
-        'model, dataset', [(p1, p2) for p1 in chat_model_list
+        'model, dataset', [(p1, p2) for p1 in model_list('chat')
                            for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('chat').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)


 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.base
+@pytest.mark.base_models
 class TestBase:
     """Test cases for base model."""

-    @pytest.mark.parametrize('model, dataset', [
-        (p1, p2) for p1 in base_model_list for p2 in
-        ['gsm8k_accuracy', 'GPQA_diamond', 'race-high_accuracy', 'winogrande']
-    ])
+    @pytest.mark.parametrize('model, dataset',
+                             [(p1, p2) for p1 in model_list('base') for p2 in [
+                                 'gsm8k_accuracy', 'GPQA_diamond_accuracy',
+                                 'race-high_accuracy', 'winogrande_accuracy'
+                             ]])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
         if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
                      ] and dataset != 'gsm8k_accuracy':
             return
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('base').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -158,28 +106,11 @@ class TestChatObjFullbench:

     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score',
-        'IFEval_Prompt-level-strict-accuracy', 'drop_accuracy',
-        'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score',
-        'musr_average_naive_average', 'korbench_single_naive_average',
-        'gsm8k_accuracy', 'math_accuracy', 'cmo_fib_accuracy',
-        'aime2024_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4',
-        'sanitized_mbpp_score', 'ds1000_naive_average',
-        'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1',
-        'lcb_test_output_pass@1', 'bbh-logical_deduction_seven_objects_score',
-        'bbh-multistep_arithmetic_two_score', 'mmlu-other_naive_average',
-        'cmmlu-china-specific_naive_average', 'mmlu_pro_math_accuracy',
-        'ds1000_Pandas_accuracy', 'ds1000_Numpy_accuracy',
-        'ds1000_Tensorflow_accuracy', 'ds1000_Scipy_accuracy',
-        'ds1000_Sklearn_accuracy', 'ds1000_Pytorch_accuracy',
-        'ds1000_Matplotlib_accuracy', 'openai_mmmlu_lite_AR-XY_accuracy',
-        'college_naive_average', 'college_knowledge_naive_average'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -193,32 +124,12 @@ class TestChatSubFullbench:

     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score',
-        'Followbench_naive_average', 'CompassArena_naive_average',
-        'mtbench101_avg', 'wildbench_average',
-        'simpleqa_accuracy_given_attempted',
-        'chinese_simpleqa_given_attempted_accuracy',
-        'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算',
-        'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理',
-        'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作',
-        'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答',
-        'alpaca_eval_helpful_base', 'compassarena_language_naive_average',
-        'compassarena_knowledge_naive_average',
-        'compassarena_reason_v2_naive_average',
-        'compassarena_math_v2_naive_average',
-        'compassarena_creationv2_zh_naive_average',
-        'fofo_test_prompts_overall', 'followbench_llmeval_en_HSR_AVG',
-        'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1',
-        'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3',
-        'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5',
-        'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2',
-        'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4',
-        'followbench_llmeval_en_SSR_L5', 'simpleqa_f1'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')]
+                             )
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -229,25 +140,15 @@ class TestChatSubFullbench:
 class TestBaseFullbench:
     """Test cases for chat model."""

-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
-        'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
-    ] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy',
-        'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score',
-        'winogrande_accuracy', 'gsm8k_accuracy',
-        'GaokaoBench_2010-2022_Math_II_MCQs_score',
-        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score',
-        'math_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4',
-        'sanitized_mbpp_score', 'dingo_en_192_score', 'dingo_zh_170_score',
-        'mmlu-other_accuracy', 'cmmlu-china-specific_accuracy',
-        'mmlu_pro_math_accuracy', 'bbh-logical_deduction_seven_objects_score',
-        'bbh-multistep_arithmetic_two_score', 'college_naive_average',
-        'college_knowledge_naive_average'
-    ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in
+         ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench']
+         for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
@@ -274,193 +175,64 @@ class TestApibench:
 class TestVolcFullbench:
     """Test cases for chat model."""

-    @pytest.mark.parametrize('model, dataset', [(
-        p1, p2
-    ) for p1 in ['internlm2_5-7b-chat-turbomind'] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score',
-        'mmmlu_lite_naive_average', 'IFEval_Prompt-level-strict-accuracy',
-        'drop_accuracy', 'bbh_naive_average', 'GPQA_diamond_accuracy',
-        'hellaswag_accuracy', 'TheoremQA_score', 'musr_average_naive_average',
-        'korbench_single_naive_average',
-        'ARC_Prize_Public_Evaluation_accuracy', 'gsm8k_accuracy',
-        'GaokaoBench_weighted_average', 'math_accuracy', 'cmo_fib_accuracy',
-        'aime2024_accuracy', 'Mathbench_naive_average',
-        'wikibench-wiki-single_choice_cncircular_perf_4',
-        'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average',
-        'openai_humaneval_humaneval_pass@1', 'sanitized_mbpp_score',
-        'humanevalx_naive_average', 'ds1000_naive_average',
-        'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1',
-        'lcb_test_output_pass@1', 'bigcodebench_hard_instruct_pass@1',
-        'bigcodebench_hard_complete_pass@1', 'teval_naive_average',
-        'qa_dingo_cn_score', 'mmlu-stem_naive_average',
-        'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average',
-        'mmlu-other_naive_average', 'cmmlu-stem_naive_average',
-        'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average',
-        'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average',
-        'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy',
-        'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy',
-        'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy',
-        'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy',
-        'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy',
-        'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy',
-        'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy',
-        'humanevalx-python_pass@1', 'humanevalx-cpp_pass@1',
-        'humanevalx-go_pass@1', 'humanevalx-java_pass@1',
-        'humanevalx-js_pass@1', 'ds1000_Pandas_accuracy',
-        'ds1000_Numpy_accuracy', 'ds1000_Tensorflow_accuracy',
-        'ds1000_Scipy_accuracy', 'ds1000_Sklearn_accuracy',
-        'ds1000_Pytorch_accuracy', 'ds1000_Matplotlib_accuracy',
-        'openai_mmmlu_lite_AR-XY_accuracy', 'openai_mmmlu_lite_BN-BD_accuracy',
-        'openai_mmmlu_lite_DE-DE_accuracy', 'openai_mmmlu_lite_ES-LA_accuracy',
-        'openai_mmmlu_lite_FR-FR_accuracy', 'openai_mmmlu_lite_HI-IN_accuracy',
-        'openai_mmmlu_lite_ID-ID_accuracy', 'openai_mmmlu_lite_IT-IT_accuracy',
-        'openai_mmmlu_lite_JA-JP_accuracy', 'openai_mmmlu_lite_KO-KR_accuracy',
-        'openai_mmmlu_lite_PT-BR_accuracy', 'openai_mmmlu_lite_SW-KE_accuracy',
-        'openai_mmmlu_lite_YO-NG_accuracy', 'openai_mmmlu_lite_ZH-CN_accuracy',
-        'college_naive_average', 'high_naive_average', 'middle_naive_average',
-        'primary_naive_average', 'arithmetic_naive_average',
-        'mathbench-a (average)_naive_average',
-        'college_knowledge_naive_average', 'high_knowledge_naive_average',
-        'middle_knowledge_naive_average', 'primary_knowledge_naive_average',
-        'mathbench-t (average)_naive_average'
-    ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
     @pytest.mark.chat_objective
     def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
-         for p2 in [
-             'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score',
-             'Followbench_naive_average', 'CompassArena_naive_average',
-             'FoFo_naive_average', 'mtbench101_avg', 'wildbench_average',
-             'simpleqa_accuracy_given_attempted',
-             'chinese_simpleqa_given_attempted_accuracy',
-             'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算',
-             'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理',
-             'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作',
-             'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答',
-             'alpaca_eval_helpful_base', 'alpaca_eval_koala',
-             'alpaca_eval_oasst', 'alpaca_eval_selfinstruct',
-             'alpaca_eval_vicuna', 'compassarena_language_naive_average',
-             'compassarena_knowledge_naive_average',
-             'compassarena_reason_v2_naive_average',
-             'compassarena_math_v2_naive_average',
-             'compassarena_creationv2_zh_naive_average',
-             'fofo_test_prompts_overall', 'fofo_test_prompts_cn_overall',
-             'followbench_llmeval_en_HSR_AVG',
-             'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1',
-             'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3',
-             'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5',
-             'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2',
-             'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4',
-             'followbench_llmeval_en_SSR_L5', 'simpleqa_f1'
-         ]])
+    @pytest.mark.parametrize('model, dataset', [
+        (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+        for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective')
+    ])
     @pytest.mark.chat_subjective
     def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
                              model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)

-    @pytest.mark.parametrize('model, dataset', [(
-        p1, p2
-    ) for p1 in ['internlm2_5-7b-turbomind'] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy',
-        'bbh_naive_average', 'GPQA_diamond_accuracy', 'hellaswag_accuracy',
-        'TheoremQA_score', 'winogrande_accuracy', 'gsm8k_accuracy',
-        'GaokaoBench_weighted_average', 'math_accuracy',
-        'Mathbench_naive_average',
-        'wikibench-wiki-single_choice_cncircular_perf_4',
-        'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average',
-        'openai_humaneval_humaneval_pass@1',
-        'openai_humaneval_v2_humaneval_pass@1', 'sanitized_mbpp_score',
-        'dingo_en_192_score', 'dingo_zh_170_score', 'mmlu-stem_naive_average',
-        'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average',
-        'mmlu-other_naive_average', 'cmmlu-stem_naive_average',
-        'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average',
-        'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average',
-        'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy',
-        'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy',
-        'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy',
-        'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy',
-        'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy',
-        'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy',
-        'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy',
-        'college_naive_average', 'high_naive_average', 'middle_naive_average',
-        'primary_naive_average', 'arithmetic_naive_average',
-        'mathbench-a (average)_naive_average',
-        'college_knowledge_naive_average', 'high_knowledge_naive_average',
-        'middle_knowledge_naive_average', 'primary_knowledge_naive_average',
-        'mathbench-t (average)_naive_average'
-    ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')])
     @pytest.mark.base_objective
     def test_base_objective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
-         for p2 in [
-             'Single-Needle-Retrieval(S-RT)-32000_naive_average',
-             'Single-Needle-Retrieval-EN-32000_naive_average',
-             'Single-Needle-Retrieval-ZH-32000_naive_average',
-             'Single-Needle-Retrieval(S-RT)-100000_naive_average',
-             'Single-Needle-Retrieval-EN-100000_naive_average',
-             'Single-Needle-Retrieval-ZH-100000_naive_average',
-             'Single-Needle-Retrieval(S-RT)-200000_naive_average',
-             'Single-Needle-Retrieval-EN-200000_naive_average',
-             'Single-Needle-Retrieval-ZH-200000_naive_average',
-             'longbench_naive_average', 'longbench_zh_naive_average',
-             'longbench_en_naive_average',
-             'longbench_single-document-qa_naive_average',
-             'longbench_multi-document-qa_naive_average',
-             'longbench_summarization_naive_average',
-             'longbench_few-shot-learning_naive_average',
-             'longbench_synthetic-tasks_naive_average',
-             'longbench_code-completion_naive_average'
-         ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')])
     @pytest.mark.base_long_context
     def test_base_long_context(self, baseline_scores_fullbench, result_scores,
                                model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-1m-turbomind']
-         for p2 in [
-             'ruler_8k_naive_average', 'ruler_32k_naive_average',
-             'ruler_128k_naive_average',
-             'NeedleBench-Overall-Score-8K_weighted_average',
-             'NeedleBench-Overall-Score-32K_weighted_average',
-             'NeedleBench-Overall-Score-128K_weighted_average',
-             'longbench_naive_average', 'longbench_zh_naive_average',
-             'longbench_en_naive_average', 'babilong_0k_naive_average',
-             'babilong_4k_naive_average', 'babilong_16k_naive_average',
-             'babilong_32k_naive_average', 'babilong_128k_naive_average',
-             'babilong_256k_naive_average',
-             'longbench_single-document-qa_naive_average',
-             'longbench_multi-document-qa_naive_average',
-             'longbench_summarization_naive_average',
-             'longbench_few-shot-learning_naive_average',
-             'longbench_synthetic-tasks_naive_average',
-             'longbench_code-completion_naive_average'
-         ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2)
+         for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list(
+             'internlm2_5-7b-chat-1m-turbomind', 'long_context')])
     @pytest.mark.chat_long_context
     def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
                                model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)
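Tying the test changes to the restructured baseline (next files): each suite now indexes one level deeper, model -> objective|subjective|long_context -> dataset, and suites are selected by the renamed pytest markers. A hedged sketch of the lookup against an inline stand-in for `oc_score_baseline_fullbench.yaml`:

import yaml

# Inline stand-in for the nested fullbench baseline; values illustrative.
baseline_snippet = """
internlm2_5-7b-chat-hf_fullbench:
  objective:
    race-high_accuracy: 93.75
"""
baseline = yaml.load(baseline_snippet, Loader=yaml.SafeLoader)
model, dataset = 'internlm2_5-7b-chat-hf_fullbench', 'race-high_accuracy'
# The same chained .get() lookup the tests now perform.
base_score = baseline.get(model).get('objective').get(dataset)
print(base_score)  # 93.75
# One suite can be run via its marker, e.g.: pytest -m chat_objective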

View File

@@ -14,12 +14,12 @@ internlm2-1.8b-hf:
   race-high_accuracy: 66.38
 internlm2_5-7b-chat-lmdeploy:
-  demo_gsm8k_accuracy: 84.38
+  demo_gsm8k_accuracy: 89.06
   race-middle_accuracy: 92.76
   race-high_accuracy: 90.54
 internlm2-chat-1.8b-lmdeploy:
-  demo_gsm8k_accuracy: 31
+  demo_gsm8k_accuracy: 32
   race-middle_accuracy: 81.34
   race-high_accuracy: 73.96

View File

@@ -1,447 +1,456 @@
 internlm2_5-7b-chat-hf_fullbench:
-  race-high_accuracy: 93.75
-  ARC-c_accuracy: 93.75
-  BoolQ_accuracy: 81.25
-  triviaqa_wiki_1shot_score: 50
-  nq_open_1shot_score: 25
-  IFEval_Prompt-level-strict-accuracy: 50
-  drop_accuracy: 81.25
-  GPQA_diamond_accuracy: 25
-  hellaswag_accuracy: 87.5
-  TheoremQA_score: 18.75
-  musr_average_naive_average: 39.58
-  korbench_single_naive_average: 40
-  gsm8k_accuracy: 62.50
-  math_accuracy: 75
-  cmo_fib_accuracy: 6.25
-  aime2024_accuracy: 6.25
-  wikibench-wiki-single_choice_cncircular_perf_4: 50
-  sanitized_mbpp_score: 68.75
-  ds1000_naive_average: 16.96
-  lcb_code_generation_pass@1: 12.5
-  lcb_code_execution_pass@1: 43.75
-  lcb_test_output_pass@1: 18.75
-  bbh-logical_deduction_seven_objects_score: 50
-  bbh-multistep_arithmetic_two_score: 68.75
-  mmlu-other_naive_average: 72.6
-  cmmlu-china-specific_naive_average: 76.25
-  mmlu_pro_math_accuracy: 25
-  ds1000_Pandas_accuracy: 12.5
-  ds1000_Numpy_accuracy: 0
-  ds1000_Tensorflow_accuracy: 12.5
-  ds1000_Scipy_accuracy: 18.75
-  ds1000_Sklearn_accuracy: 18.75
-  ds1000_Pytorch_accuracy: 12.5
-  ds1000_Matplotlib_accuracy: 43.75
-  openai_mmmlu_lite_AR-XY_accuracy: 37.5
-  college_naive_average: 12.5
-  college_knowledge_naive_average: 87.5
-  alignment_bench_v1_1_总分: 0.66
-  alpaca_eval_total: 0
-  arenahard_score: 50
-  Followbench_naive_average: 1
-  CompassArena_naive_average: 54.48
-  mtbench101_avg: 8.1
-  wildbench_average: -9.86
-  simpleqa_accuracy_given_attempted: 0
-  chinese_simpleqa_given_attempted_accuracy: 1
-  alignment_bench_v1_1_专业能力: 8
-  alignment_bench_v1_1_数学计算: 0
-  alignment_bench_v1_1_基本任务: 0
-  alignment_bench_v1_1_逻辑推理: 0
-  alignment_bench_v1_1_中文理解: 0
-  alignment_bench_v1_1_文本写作: 0
-  alignment_bench_v1_1_角色扮演: 0
-  alignment_bench_v1_1_综合问答: 0
-  alpaca_eval_helpful_base: 0
-  compassarena_language_naive_average: 62
-  compassarena_knowledge_naive_average: 56
-  compassarena_reason_v2_naive_average: 49
-  compassarena_math_v2_naive_average: 57.05
-  compassarena_creationv2_zh_naive_average: 48.34
-  fofo_test_prompts_overall: 1
-  followbench_llmeval_en_HSR_AVG: 1
-  followbench_llmeval_en_SSR_AVG: 1
-  followbench_llmeval_en_HSR_L1: 1
-  followbench_llmeval_en_HSR_L2: 1
-  followbench_llmeval_en_HSR_L3: 1
-  followbench_llmeval_en_HSR_L4: 1
-  followbench_llmeval_en_HSR_L5: 1
-  followbench_llmeval_en_SSR_L1: 1
-  followbench_llmeval_en_SSR_L2: 1
-  followbench_llmeval_en_SSR_L3: 1
-  followbench_llmeval_en_SSR_L4: 1
-  followbench_llmeval_en_SSR_L5: 1
-  simpleqa_f1: 0
+  objective:
+    race-high_accuracy: 93.75
+    ARC-c_accuracy: 93.75
+    BoolQ_accuracy: 81.25
+    triviaqa_wiki_1shot_score: 50
+    nq_open_1shot_score: 25
+    IFEval_Prompt-level-strict-accuracy: 50
+    drop_accuracy: 81.25
+    GPQA_diamond_accuracy: 25
+    hellaswag_accuracy: 87.5
+    TheoremQA_score: 18.75
+    musr_average_naive_average: 39.58
+    korbench_single_naive_average: 40
+    gsm8k_accuracy: 62.50
+    math_accuracy: 75
+    cmo_fib_accuracy: 6.25
+    aime2024_accuracy: 6.25
+    wikibench-wiki-single_choice_cncircular_perf_4: 50
+    sanitized_mbpp_score: 68.75
+    ds1000_naive_average: 16.96
+    lcb_code_generation_pass@1: 12.5
+    lcb_code_execution_pass@1: 43.75
+    lcb_test_output_pass@1: 18.75
+    bbh-logical_deduction_seven_objects_score: 50
+    bbh-multistep_arithmetic_two_score: 68.75
+    mmlu-other_naive_average: 72.6
+    cmmlu-china-specific_naive_average: 76.25
+    mmlu_pro_math_accuracy: 25
+    ds1000_Pandas_accuracy: 12.5
+    ds1000_Numpy_accuracy: 0
+    ds1000_Tensorflow_accuracy: 12.5
+    ds1000_Scipy_accuracy: 18.75
+    ds1000_Sklearn_accuracy: 18.75
+    ds1000_Pytorch_accuracy: 12.5
+    ds1000_Matplotlib_accuracy: 43.75
+    openai_mmmlu_lite_AR-XY_accuracy: 37.5
+    college_naive_average: 12.5
+    college_knowledge_naive_average: 87.5
+  subjective:
+    alignment_bench_v1_1_总分: 0.66
+    alpaca_eval_total: 20
+    arenahard_score: 50
+    Followbench_naive_average: 1
+    CompassArena_naive_average: 44.00
+    mtbench101_avg: 7.8
+    wildbench_average: -12.78
+    simpleqa_accuracy_given_attempted: 0
+    chinese_simpleqa_given_attempted_accuracy: 1
+    alignment_bench_v1_1_专业能力: 7.90
+    alignment_bench_v1_1_数学计算: 0
+    alignment_bench_v1_1_基本任务: 0
+    alignment_bench_v1_1_逻辑推理: 0
+    alignment_bench_v1_1_中文理解: 0
+    alignment_bench_v1_1_文本写作: 0
+    alignment_bench_v1_1_角色扮演: 0
+    alignment_bench_v1_1_综合问答: 0
+    alpaca_eval_helpful_base: 20
+    compassarena_language_naive_average: 35
+    compassarena_knowledge_naive_average: 55
+    compassarena_reason_v2_naive_average: 45.00
+    compassarena_math_v2_naive_average: 55
+    compassarena_creationv2_zh_naive_average: 30
+    followbench_llmeval_en_HSR_AVG: 1
+    followbench_llmeval_en_SSR_AVG: 1
+    followbench_llmeval_en_HSR_L1: 1
+    followbench_llmeval_en_HSR_L2: 1
+    followbench_llmeval_en_HSR_L3: 1
+    followbench_llmeval_en_HSR_L4: 1
+    followbench_llmeval_en_HSR_L5: 1
+    followbench_llmeval_en_SSR_L1: 1
+    followbench_llmeval_en_SSR_L2: 1
+    followbench_llmeval_en_SSR_L3: 1
+    followbench_llmeval_en_SSR_L4: 1
+    followbench_llmeval_en_SSR_L5: 1
+    simpleqa_f1: 0
 internlm2_5-7b-chat-turbomind_fullbench:
-  race-high_accuracy: 93.75
-  ARC-c_accuracy: 87.5
-  BoolQ_accuracy: 68.75
-  triviaqa_wiki_1shot_score: 50
-  nq_open_1shot_score: 25
-  IFEval_Prompt-level-strict-accuracy: 50
-  drop_accuracy: 75
-  GPQA_diamond_accuracy: 25
-  hellaswag_accuracy: 81.25
-  TheoremQA_score: 6.25
-  musr_average_naive_average: 37.5
-  korbench_single_naive_average: 41.25
-  gsm8k_accuracy: 68.75
-  math_accuracy: 75
-  cmo_fib_accuracy: 6.25
-  aime2024_accuracy: 6.25
-  wikibench-wiki-single_choice_cncircular_perf_4: 25
-  sanitized_mbpp_score: 68.75
-  ds1000_naive_average: 13.39
-  lcb_code_generation_pass@1: 12.5
-  lcb_code_execution_pass@1: 43.75
-  lcb_test_output_pass@1: 12.5
-  bbh-logical_deduction_seven_objects_score: 56.25
-  bbh-multistep_arithmetic_two_score: 68.75
-  mmlu-other_naive_average: 74.04
-  cmmlu-china-specific_naive_average: 76.25
-  mmlu_pro_math_accuracy: 25
-  ds1000_Pandas_accuracy: 0
-  ds1000_Numpy_accuracy: 0
-  ds1000_Tensorflow_accuracy: 12.5
-  ds1000_Scipy_accuracy: 18.75
-  ds1000_Sklearn_accuracy: 18.75
-  ds1000_Pytorch_accuracy: 6.25
-  ds1000_Matplotlib_accuracy: 37.5
-  openai_mmmlu_lite_AR-XY_accuracy: 37.5
-  college_naive_average: 0
-  college_knowledge_naive_average: 87.5
-  alignment_bench_v1_1_总分: 0.68
-  alpaca_eval_total: 10
-  arenahard_score: 50
-  Followbench_naive_average: 1
-  CompassArena_naive_average: 52.95
-  mtbench101_avg: 8.1
-  wildbench_average: -4.44
-  simpleqa_accuracy_given_attempted: 0
-  chinese_simpleqa_given_attempted_accuracy: 1
-  alignment_bench_v1_1_专业能力: 8.2
-  alignment_bench_v1_1_数学计算: 0
-  alignment_bench_v1_1_基本任务: 0
-  alignment_bench_v1_1_逻辑推理: 0
-  alignment_bench_v1_1_中文理解: 0
-  alignment_bench_v1_1_文本写作: 0
-  alignment_bench_v1_1_角色扮演: 0
-  alignment_bench_v1_1_综合问答: 0
-  alpaca_eval_helpful_base: 10
-  compassarena_language_naive_average: 61.5
-  compassarena_knowledge_naive_average: 56.5
-  compassarena_reason_v2_naive_average: 47.5
-  compassarena_math_v2_naive_average: 53.03
-  compassarena_creationv2_zh_naive_average: 46.22
-  fofo_test_prompts_overall: 1
-  followbench_llmeval_en_HSR_AVG: 1
-  followbench_llmeval_en_SSR_AVG: 1
-  followbench_llmeval_en_HSR_L1: 1
-  followbench_llmeval_en_HSR_L2: 1
-  followbench_llmeval_en_HSR_L3: 1
-  followbench_llmeval_en_HSR_L4: 1
-  followbench_llmeval_en_HSR_L5: 1
-  followbench_llmeval_en_SSR_L1: 1
-  followbench_llmeval_en_SSR_L2: 1
-  followbench_llmeval_en_SSR_L3: 1
-  followbench_llmeval_en_SSR_L4: 1
-  followbench_llmeval_en_SSR_L5: 1
-  simpleqa_f1: 0
+  objective:
+    race-high_accuracy: 93.75
+    ARC-c_accuracy: 93.75
+    BoolQ_accuracy: 68.75
+    triviaqa_wiki_1shot_score: 50
+    nq_open_1shot_score: 25
+    IFEval_Prompt-level-strict-accuracy: 56.25
+    drop_accuracy: 81.25
+    GPQA_diamond_accuracy: 31.25
+    hellaswag_accuracy: 81.25
+    TheoremQA_score: 6.25
+    musr_average_naive_average: 39.58
+    korbench_single_naive_average: 37.50
+    gsm8k_accuracy: 68.75
+    math_accuracy: 68.75
+    cmo_fib_accuracy: 6.25
+    aime2024_accuracy: 6.25
+    wikibench-wiki-single_choice_cncircular_perf_4: 50.00
+    sanitized_mbpp_score: 68.75
+    ds1000_naive_average: 16.96
+    lcb_code_generation_pass@1: 12.5
+    lcb_code_execution_pass@1: 43.75
+    lcb_test_output_pass@1: 25.00
+    bbh-logical_deduction_seven_objects_score: 50.00
+    bbh-multistep_arithmetic_two_score: 68.75
+    mmlu-other_naive_average: 69.71
+    cmmlu-china-specific_naive_average: 75.83
+    mmlu_pro_math_accuracy: 31.25
+    ds1000_Pandas_accuracy: 0
+    ds1000_Numpy_accuracy: 0
+    ds1000_Tensorflow_accuracy: 12.5
+    ds1000_Scipy_accuracy: 18.75
+    ds1000_Sklearn_accuracy: 18.75
+    ds1000_Pytorch_accuracy: 18.75
+    ds1000_Matplotlib_accuracy: 50.00
+    openai_mmmlu_lite_AR-XY_accuracy: 37.5
+    college_naive_average: 12.50
+    college_knowledge_naive_average: 87.5
+  subjective:
+    alignment_bench_v1_1_总分: 0.70
+    alpaca_eval_total: 0
+    arenahard_score: 50
+    Followbench_naive_average: 1
+    CompassArena_naive_average: 38
+    mtbench101_avg: 7.80
+    wildbench_average: -4.86
+    simpleqa_accuracy_given_attempted: 0
+    chinese_simpleqa_given_attempted_accuracy: 1
+    alignment_bench_v1_1_专业能力: 8.4
+    alignment_bench_v1_1_数学计算: 0
+    alignment_bench_v1_1_基本任务: 0
+    alignment_bench_v1_1_逻辑推理: 0
+    alignment_bench_v1_1_中文理解: 0
+    alignment_bench_v1_1_文本写作: 0
+    alignment_bench_v1_1_角色扮演: 0
+    alignment_bench_v1_1_综合问答: 0
+    alpaca_eval_helpful_base: 0
+    compassarena_language_naive_average: 35
+    compassarena_knowledge_naive_average: 50
+    compassarena_reason_v2_naive_average: 30
+    compassarena_math_v2_naive_average: 50
+    compassarena_creationv2_zh_naive_average: 25
+    followbench_llmeval_en_HSR_AVG: 1
+    followbench_llmeval_en_SSR_AVG: 1
+    followbench_llmeval_en_HSR_L1: 1
+    followbench_llmeval_en_HSR_L2: 1
+    followbench_llmeval_en_HSR_L3: 1
+    followbench_llmeval_en_HSR_L4: 1
+    followbench_llmeval_en_HSR_L5: 1
+    followbench_llmeval_en_SSR_L1: 1
+    followbench_llmeval_en_SSR_L2: 1
+    followbench_llmeval_en_SSR_L3: 1
+    followbench_llmeval_en_SSR_L4: 1
+    followbench_llmeval_en_SSR_L5: 1
+    simpleqa_f1: 0
 internlm2_5-7b-hf_fullbench:
-  race-high_accuracy: 100
-  ARC-c_accuracy: 68.75
-  BoolQ_accuracy: 87.5
-  triviaqa_wiki_1shot_score: 43.75
-  nq_open_1shot_score: 43.75
-  drop_accuracy: 62.5
-  GPQA_diamond_accuracy: 62.5
-  hellaswag_accuracy: 93.75
-  TheoremQA_score: 25
-  winogrande_accuracy: 75
-  gsm8k_accuracy: 37.5
-  GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
-  GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
-  math_accuracy: 12.5
-  wikibench-wiki-single_choice_cncircular_perf_4: 25
-  sanitized_mbpp_score: 56.25
-  dingo_en_192_score: 37.5
-  dingo_zh_170_score: 100
-  mmlu-other_accuracy: 76.92
-  cmmlu-china-specific_accuracy: 84.17
-  mmlu_pro_math_accuracy: 18.75
-  bbh-logical_deduction_seven_objects_score: 43.75
-  bbh-multistep_arithmetic_two_score: 56.25
-  college_naive_average: 12.5
-  college_knowledge_naive_average: 87.5
+  objective:
+    race-high_accuracy: 100
+    ARC-c_accuracy: 68.75
+    BoolQ_accuracy: 87.5
+    triviaqa_wiki_1shot_score: 43.75
+    nq_open_1shot_score: 43.75
+    drop_accuracy: 62.5
+    GPQA_diamond_accuracy: 62.5
+    hellaswag_accuracy: 93.75
+    TheoremQA_score: 25
+    winogrande_accuracy: 75
+    gsm8k_accuracy: 37.5
+    GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
+    GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
+    math_accuracy: 12.5
+    wikibench-wiki-single_choice_cncircular_perf_4: 25
+    sanitized_mbpp_score: 56.25
+    dingo_en_192_score: 37.5
+    dingo_zh_170_score: 100
+    mmlu-other_accuracy: 76.92
+    cmmlu-china-specific_accuracy: 84.17
+    mmlu_pro_math_accuracy: 18.75
+    bbh-logical_deduction_seven_objects_score: 43.75
+    bbh-multistep_arithmetic_two_score: 56.25
+    college_naive_average: 12.5
+    college_knowledge_naive_average: 87.5
 internlm2_5-7b-turbomind_fullbench:
-  race-high_accuracy: 100
-  ARC-c_accuracy: 68.75
-  BoolQ_accuracy: 87.5
-  triviaqa_wiki_1shot_score: 43.75
-  nq_open_1shot_score: 43.75
-  drop_accuracy: 62.5
-  GPQA_diamond_accuracy: 62.5
-  hellaswag_accuracy: 93.75
-  TheoremQA_score: 31.25
-  winogrande_accuracy: 87.5
-  gsm8k_accuracy: 68.75
-  GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
-  GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
-  math_accuracy: 18.75
-  wikibench-wiki-single_choice_cncircular_perf_4: 25
-  sanitized_mbpp_score: 56.25
-  dingo_en_192_score: 43.75
-  dingo_zh_170_score: 100
-  mmlu-other_accuracy: 76.92
-  cmmlu-china-specific_accuracy: 84.17
-  mmlu_pro_math_accuracy: 18.75
-  bbh-logical_deduction_seven_objects_score: 50
-  bbh-multistep_arithmetic_two_score: 56.25
-  college_naive_average: 12.5
-  college_knowledge_naive_average: 87.5
+  objective:
+    race-high_accuracy: 100
+    ARC-c_accuracy: 68.75
+    BoolQ_accuracy: 87.5
+    triviaqa_wiki_1shot_score: 43.75
+    nq_open_1shot_score: 43.75
+    drop_accuracy: 62.5
+    GPQA_diamond_accuracy: 62.5
+    hellaswag_accuracy: 93.75
+    TheoremQA_score: 25.00
+    winogrande_accuracy: 87.5
+    gsm8k_accuracy: 62.50
+    GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
+    GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
+    math_accuracy: 18.75
+    wikibench-wiki-single_choice_cncircular_perf_4: 25
+    sanitized_mbpp_score: 62.50
+    dingo_en_192_score: 31.25
+    dingo_zh_170_score: 93.75
+    mmlu-other_accuracy: 76.92
+    cmmlu-china-specific_accuracy: 84.17
+    mmlu_pro_math_accuracy: 18.75
+    bbh-logical_deduction_seven_objects_score: 50
+    bbh-multistep_arithmetic_two_score: 56.25
+    college_naive_average: 12.5
+    college_knowledge_naive_average: 87.5
 internlm2_5-7b-turbomind:
-  race-high_accuracy: 89.28
-  ARC-c_accuracy: 52.2
-  BoolQ_accuracy: 89.72
-  triviaqa_wiki_1shot_score: 65.88
-  nq_open_1shot_score: 34.82
-  drop_accuracy: 68.1
-  bbh_naive_average: 72.15
-  GPQA_diamond_accuracy: 32.83
-  hellaswag_accuracy: 88.36
-  TheoremQA_score: 25
-  winogrande_accuracy: 81.29
-  gsm8k_accuracy: 74.68
-  GaokaoBench_weighted_average: 58.19
-  math_accuracy: 33.98
-  Mathbench_naive_average: 48.38
-  wikibench-wiki-single_choice_cncircular_perf_4: 29.1
-  cmmlu_naive_average: 78.94
-  mmlu_naive_average: 71.44
-  mmlu_pro_naive_average: 38.18
+  objective:
+    race-high_accuracy: 89.28
+    ARC-c_accuracy: 52.2
+    BoolQ_accuracy: 89.72
+    triviaqa_wiki_1shot_score: 65.88
+    nq_open_1shot_score: 34.82
+    drop_accuracy: 68.1
+    bbh_naive_average: 72.15
+    GPQA_diamond_accuracy: 32.83
+    hellaswag_accuracy: 88.36
+    TheoremQA_score: 25
+    winogrande_accuracy: 81.29
+    gsm8k_accuracy: 74.68
+    GaokaoBench_weighted_average: 58.19
+    math_accuracy: 33.98
+    Mathbench_naive_average: 48.38
+    wikibench-wiki-single_choice_cncircular_perf_4: 29.1
+    cmmlu_naive_average: 78.94
+    mmlu_naive_average: 71.44
openai_humaneval_humaneval_pass@1: 59.76 mmlu_pro_naive_average: 38.18
openai_humaneval_v2_humaneval_pass@1: 51.22 openai_humaneval_humaneval_pass@1: 59.76
sanitized_mbpp_score: 55.25 openai_humaneval_v2_humaneval_pass@1: 51.22
dingo_en_192_score: 60.94 sanitized_mbpp_score: 55.25
dingo_zh_170_score: 67.65 dingo_en_192_score: 60.94
mmlu-stem_naive_average: 63.72 dingo_zh_170_score: 67.65
mmlu-social-science_naive_average: 80.15 mmlu-stem_naive_average: 63.72
mmlu-humanities_naive_average: 74.27 mmlu-social-science_naive_average: 80.15
mmlu-other_naive_average: 71.85 mmlu-humanities_naive_average: 74.27
cmmlu-stem_naive_average: 67.07 mmlu-other_naive_average: 71.85
cmmlu-social-science_naive_average: 81.49 cmmlu-stem_naive_average: 67.07
cmmlu-humanities_naive_average: 85.84 cmmlu-social-science_naive_average: 81.49
cmmlu-other_naive_average: 82.69 cmmlu-humanities_naive_average: 85.84
cmmlu-china-specific_naive_average: 79.88 cmmlu-other_naive_average: 82.69
mmlu_pro_biology_accuracy: 58.58 cmmlu-china-specific_naive_average: 79.88
mmlu_pro_business_accuracy: 28.01 mmlu_pro_biology_accuracy: 58.58
mmlu_pro_chemistry_accuracy: 22.79 mmlu_pro_business_accuracy: 28.01
mmlu_pro_computer_science_accuracy: 39.02 mmlu_pro_chemistry_accuracy: 22.79
mmlu_pro_economics_accuracy: 53.08 mmlu_pro_computer_science_accuracy: 39.02
mmlu_pro_engineering_accuracy: 25.7 mmlu_pro_economics_accuracy: 53.08
mmlu_pro_health_accuracy: 46.94 mmlu_pro_engineering_accuracy: 25.7
mmlu_pro_history_accuracy: 43.04 mmlu_pro_health_accuracy: 46.94
mmlu_pro_law_accuracy: 29.7 mmlu_pro_history_accuracy: 43.04
mmlu_pro_math_accuracy: 24.2 mmlu_pro_law_accuracy: 29.7
mmlu_pro_philosophy_accuracy: 42.48 mmlu_pro_math_accuracy: 24.2
mmlu_pro_physics_accuracy: 26.02 mmlu_pro_philosophy_accuracy: 42.48
mmlu_pro_psychology_accuracy: 52.76 mmlu_pro_physics_accuracy: 26.02
mmlu_pro_other_accuracy: 42.21 mmlu_pro_psychology_accuracy: 52.76
college_naive_average: 10.67 mmlu_pro_other_accuracy: 42.21
high_naive_average: 6.67 college_naive_average: 10.67
middle_naive_average: 26.67 high_naive_average: 6.67
primary_naive_average: 60 middle_naive_average: 26.67
arithmetic_naive_average: 55 primary_naive_average: 60
mathbench-a (average)_naive_average: 31.8 arithmetic_naive_average: 55
college_knowledge_naive_average: 62.34 mathbench-a (average)_naive_average: 31.8
high_knowledge_naive_average: 59.83 college_knowledge_naive_average: 62.34
middle_knowledge_naive_average: 71.15 high_knowledge_naive_average: 59.83
primary_knowledge_naive_average: 66.55 middle_knowledge_naive_average: 71.15
mathbench-t (average)_naive_average: 64.97 primary_knowledge_naive_average: 66.55
Single-Needle-Retrieval(S-RT)-32000_naive_average: 100 mathbench-t (average)_naive_average: 64.97
Single-Needle-Retrieval-EN-32000_naive_average: 100 long_context:
Single-Needle-Retrieval-ZH-32000_naive_average: 100 Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
Single-Needle-Retrieval(S-RT)-100000_naive_average: 100 Single-Needle-Retrieval-EN-32000_naive_average: 100
Single-Needle-Retrieval-EN-100000_naive_average: 100 Single-Needle-Retrieval-ZH-32000_naive_average: 100
Single-Needle-Retrieval-ZH-100000_naive_average: 100 Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
Single-Needle-Retrieval(S-RT)-200000_naive_average: 100 Single-Needle-Retrieval-EN-100000_naive_average: 100
Single-Needle-Retrieval-EN-200000_naive_average: 100 Single-Needle-Retrieval-ZH-100000_naive_average: 100
Single-Needle-Retrieval-ZH-200000_naive_average: 100 Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
longbench_naive_average: 46.19 Single-Needle-Retrieval-EN-200000_naive_average: 100
longbench_zh_naive_average: 49.3 Single-Needle-Retrieval-ZH-200000_naive_average: 100
longbench_en_naive_average: 43.97 longbench_naive_average: 46.19
longbench_single-document-qa_naive_average: 42.84 longbench_zh_naive_average: 49.3
longbench_multi-document-qa_naive_average: 37.29 longbench_en_naive_average: 43.97
longbench_summarization_naive_average: 23.21 longbench_single-document-qa_naive_average: 42.84
longbench_few-shot-learning_naive_average: 61.67 longbench_multi-document-qa_naive_average: 37.29
longbench_synthetic-tasks_naive_average: 60.05 longbench_summarization_naive_average: 23.21
longbench_code-completion_naive_average: 52.09 longbench_few-shot-learning_naive_average: 61.67
longbench_synthetic-tasks_naive_average: 60.05
longbench_code-completion_naive_average: 52.09
internlm2_5-7b-chat-turbomind: internlm2_5-7b-chat-turbomind:
race-high_accuracy: 86.16 objective:
ARC-c_accuracy: 90.17 race-high_accuracy: 86.16
BoolQ_accuracy: 87.89 ARC-c_accuracy: 90.17
triviaqa_wiki_1shot_score: 64.91 BoolQ_accuracy: 87.89
nq_open_1shot_score: 22.69 triviaqa_wiki_1shot_score: 64.91
mmmlu_lite_naive_average: 44.96 nq_open_1shot_score: 22.69
IFEval_Prompt-level-strict-accuracy: 58.04 mmmlu_lite_naive_average: 44.96
drop_accuracy: 77.68 IFEval_Prompt-level-strict-accuracy: 58.04
bbh_naive_average: 73.14 drop_accuracy: 77.68
GPQA_diamond_accuracy: 25.76 bbh_naive_average: 73.14
hellaswag_accuracy: 94.79 GPQA_diamond_accuracy: 25.76
TheoremQA_score: 21.5 hellaswag_accuracy: 94.79
musr_average_naive_average: 51.03 TheoremQA_score: 21.5
korbench_single_naive_average: 31.92 musr_average_naive_average: 51.03
ARC_Prize_Public_Evaluation_accuracy: 0.01 korbench_single_naive_average: 31.92
gsm8k_accuracy: 86.73 ARC_Prize_Public_Evaluation_accuracy: 0.01
GaokaoBench_weighted_average: 77.89 gsm8k_accuracy: 86.73
math_accuracy: 61.5 GaokaoBench_weighted_average: 77.89
cmo_fib_accuracy: 12.5 math_accuracy: 61.5
aime2024_accuracy: 3.33 cmo_fib_accuracy: 12.5
Mathbench_naive_average: 65.17 aime2024_accuracy: 3.33
wikibench-wiki-single_choice_cncircular_perf_4: 31.55 Mathbench_naive_average: 65.17
cmmlu_naive_average: 74.14 wikibench-wiki-single_choice_cncircular_perf_4: 31.55
mmlu_naive_average: 70.52 cmmlu_naive_average: 74.14
mmlu_pro_naive_average: 44.98 mmlu_naive_average: 70.52
openai_humaneval_humaneval_pass@1: 70.73 mmlu_pro_naive_average: 44.98
sanitized_mbpp_score: 63.81 openai_humaneval_humaneval_pass@1: 70.73
humanevalx_naive_average: 38.17 sanitized_mbpp_score: 63.81
ds1000_naive_average: 14.15 humanevalx_naive_average: 38.17
lcb_code_generation_pass@1: 17.75 ds1000_naive_average: 14.15
lcb_code_execution_pass@1: 32.57 lcb_code_generation_pass@1: 17.75
lcb_test_output_pass@1: 24.89 lcb_code_execution_pass@1: 32.57
bigcodebench_hard_instruct_pass@1: 0.08 lcb_test_output_pass@1: 24.89
bigcodebench_hard_complete_pass@1: 0.06 bigcodebench_hard_instruct_pass@1: 0.08
teval_naive_average: 80.03 bigcodebench_hard_complete_pass@1: 0.06
qa_dingo_cn_score: 99.01 teval_naive_average: 80.03
mmlu-stem_naive_average: 68.2 qa_dingo_cn_score: 99.01
mmlu-social-science_naive_average: 76.11 mmlu-stem_naive_average: 68.2
mmlu-humanities_naive_average: 68.71 mmlu-social-science_naive_average: 76.11
mmlu-other_naive_average: 70.56 mmlu-humanities_naive_average: 68.71
cmmlu-stem_naive_average: 66.27 mmlu-other_naive_average: 70.56
cmmlu-social-science_naive_average: 75.7 cmmlu-stem_naive_average: 66.27
cmmlu-humanities_naive_average: 77.7 cmmlu-social-science_naive_average: 75.7
cmmlu-other_naive_average: 77.71 cmmlu-humanities_naive_average: 77.7
cmmlu-china-specific_naive_average: 72.94 cmmlu-other_naive_average: 77.71
mmlu_pro_biology_accuracy: 66.25 cmmlu-china-specific_naive_average: 72.94
mmlu_pro_business_accuracy: 48.42 mmlu_pro_biology_accuracy: 66.25
mmlu_pro_chemistry_accuracy: 35.25 mmlu_pro_business_accuracy: 48.42
mmlu_pro_computer_science_accuracy: 47.56 mmlu_pro_chemistry_accuracy: 35.25
mmlu_pro_economics_accuracy: 55.92 mmlu_pro_computer_science_accuracy: 47.56
mmlu_pro_engineering_accuracy: 30.44 mmlu_pro_economics_accuracy: 55.92
mmlu_pro_health_accuracy: 45.97 mmlu_pro_engineering_accuracy: 30.44
mmlu_pro_history_accuracy: 41.21 mmlu_pro_health_accuracy: 45.97
mmlu_pro_law_accuracy: 25.79 mmlu_pro_history_accuracy: 41.21
mmlu_pro_math_accuracy: 54.03 mmlu_pro_law_accuracy: 25.79
mmlu_pro_philosophy_accuracy: 36.47 mmlu_pro_math_accuracy: 54.03
mmlu_pro_physics_accuracy: 37.41 mmlu_pro_philosophy_accuracy: 36.47
mmlu_pro_psychology_accuracy: 58.77 mmlu_pro_physics_accuracy: 37.41
mmlu_pro_other_accuracy: 46.21 mmlu_pro_psychology_accuracy: 58.77
humanevalx-python_pass@1: 53.66 mmlu_pro_other_accuracy: 46.21
humanevalx-cpp_pass@1: 24.39 humanevalx-python_pass@1: 53.66
humanevalx-go_pass@1: 0 humanevalx-cpp_pass@1: 24.39
humanevalx-java_pass@1: 57.93 humanevalx-go_pass@1: 0
humanevalx-js_pass@1: 54.88 humanevalx-java_pass@1: 57.93
ds1000_Pandas_accuracy: 12.03 humanevalx-js_pass@1: 54.88
ds1000_Numpy_accuracy: 4.09 ds1000_Pandas_accuracy: 12.03
ds1000_Tensorflow_accuracy: 11.11 ds1000_Numpy_accuracy: 4.09
ds1000_Scipy_accuracy: 8.49 ds1000_Tensorflow_accuracy: 11.11
ds1000_Sklearn_accuracy: 6.96 ds1000_Scipy_accuracy: 8.49
ds1000_Pytorch_accuracy: 7.35 ds1000_Sklearn_accuracy: 6.96
ds1000_Matplotlib_accuracy: 49.03 ds1000_Pytorch_accuracy: 7.35
openai_mmmlu_lite_AR-XY_accuracy: 17.89 ds1000_Matplotlib_accuracy: 49.03
openai_mmmlu_lite_BN-BD_accuracy: 27.58 openai_mmmlu_lite_AR-XY_accuracy: 17.89
openai_mmmlu_lite_DE-DE_accuracy: 51.16 openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_ES-LA_accuracy: 56.84 openai_mmmlu_lite_DE-DE_accuracy: 51.16
openai_mmmlu_lite_FR-FR_accuracy: 57.96 openai_mmmlu_lite_ES-LA_accuracy: 56.84
openai_mmmlu_lite_HI-IN_accuracy: 33.68 openai_mmmlu_lite_FR-FR_accuracy: 57.96
openai_mmmlu_lite_ID-ID_accuracy: 51.02 openai_mmmlu_lite_HI-IN_accuracy: 33.68
openai_mmmlu_lite_IT-IT_accuracy: 50.46 openai_mmmlu_lite_ID-ID_accuracy: 51.02
openai_mmmlu_lite_JA-JP_accuracy: 50.53 openai_mmmlu_lite_IT-IT_accuracy: 50.46
openai_mmmlu_lite_KO-KR_accuracy: 45.05 openai_mmmlu_lite_JA-JP_accuracy: 50.53
openai_mmmlu_lite_PT-BR_accuracy: 57.68 openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_SW-KE_accuracy: 32.77 openai_mmmlu_lite_PT-BR_accuracy: 57.68
openai_mmmlu_lite_YO-NG_accuracy: 31.79 openai_mmmlu_lite_SW-KE_accuracy: 32.77
openai_mmmlu_lite_ZH-CN_accuracy: 65.05 openai_mmmlu_lite_YO-NG_accuracy: 31.79
college_naive_average: 20.33 openai_mmmlu_lite_ZH-CN_accuracy: 65.05
high_naive_average: 47.67 college_naive_average: 20.33
middle_naive_average: 62 high_naive_average: 47.67
primary_naive_average: 72 middle_naive_average: 62
arithmetic_naive_average: 62.33 primary_naive_average: 72
mathbench-a (average)_naive_average: 52.87 arithmetic_naive_average: 62.33
college_knowledge_naive_average: 70.57 mathbench-a (average)_naive_average: 52.87
high_knowledge_naive_average: 70.13 college_knowledge_naive_average: 70.57
middle_knowledge_naive_average: 81.17 high_knowledge_naive_average: 70.13
primary_knowledge_naive_average: 88.01 middle_knowledge_naive_average: 81.17
mathbench-t (average)_naive_average: 77.47 primary_knowledge_naive_average: 88.01
alignment_bench_v1_1_总分: 5.68 mathbench-t (average)_naive_average: 77.47
alpaca_eval_total: 25.96 subjective:
arenahard_score: 17.15 alignment_bench_v1_1_总分: 5.68
Followbench_naive_average: 0.81 alpaca_eval_total: 25.96
CompassArena_naive_average: 34.61 arenahard_score: 17.15
FoFo_naive_average: 0.38 Followbench_naive_average: 0.81
mtbench101_avg: 8.01 CompassArena_naive_average: 34.61
wildbench_average: -15.69 FoFo_naive_average: 0.38
simpleqa_accuracy_given_attempted: 0.04 mtbench101_avg: 8.01
chinese_simpleqa_given_attempted_accuracy: 0.34 wildbench_average: -15.69
alignment_bench_v1_1_专业能力: 6.05 simpleqa_accuracy_given_attempted: 0.04
alignment_bench_v1_1_数学计算: 5.87 chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_基本任务: 6.01 alignment_bench_v1_1_专业能力: 6.05
alignment_bench_v1_1_逻辑推理: 4.48 alignment_bench_v1_1_数学计算: 5.87
alignment_bench_v1_1_中文理解: 6.17 alignment_bench_v1_1_基本任务: 6.01
alignment_bench_v1_1_文本写作: 6.06 alignment_bench_v1_1_逻辑推理: 4.48
alignment_bench_v1_1_角色扮演: 6.3 alignment_bench_v1_1_中文理解: 6.17
alignment_bench_v1_1_综合问答: 6.45 alignment_bench_v1_1_文本写作: 6.06
alpaca_eval_helpful_base: 17.83 alignment_bench_v1_1_角色扮演: 6.3
alpaca_eval_koala: 28.21 alignment_bench_v1_1_综合问答: 6.45
alpaca_eval_oasst: 23.4 alpaca_eval_helpful_base: 17.83
alpaca_eval_selfinstruct: 30.95 alpaca_eval_koala: 28.21
alpaca_eval_vicuna: 25 alpaca_eval_oasst: 23.4
compassarena_language_naive_average: 52.5 alpaca_eval_selfinstruct: 30.95
compassarena_knowledge_naive_average: 36 alpaca_eval_vicuna: 25
compassarena_reason_v2_naive_average: 35 compassarena_language_naive_average: 52.5
compassarena_math_v2_naive_average: 19.91 compassarena_knowledge_naive_average: 36
compassarena_creationv2_zh_naive_average: 29.64 compassarena_reason_v2_naive_average: 35
fofo_test_prompts_overall: 0.35 compassarena_math_v2_naive_average: 19.91
fofo_test_prompts_cn_overall: 0.41 compassarena_creationv2_zh_naive_average: 29.64
followbench_llmeval_en_HSR_AVG: 0.73 fofo_test_prompts_overall: 0.35
followbench_llmeval_en_SSR_AVG: 0.88 fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_L1: 0.94 followbench_llmeval_en_HSR_AVG: 0.73
followbench_llmeval_en_HSR_L2: 0.77 followbench_llmeval_en_SSR_AVG: 0.88
followbench_llmeval_en_HSR_L3: 0.73 followbench_llmeval_en_HSR_L1: 0.94
followbench_llmeval_en_HSR_L4: 0.68 followbench_llmeval_en_HSR_L2: 0.77
followbench_llmeval_en_HSR_L5: 0.54 followbench_llmeval_en_HSR_L3: 0.73
followbench_llmeval_en_SSR_L1: 0.94 followbench_llmeval_en_HSR_L4: 0.68
followbench_llmeval_en_SSR_L2: 0.88 followbench_llmeval_en_HSR_L5: 0.54
followbench_llmeval_en_SSR_L3: 0.87 followbench_llmeval_en_SSR_L1: 0.94
followbench_llmeval_en_SSR_L4: 0.87 followbench_llmeval_en_SSR_L2: 0.88
followbench_llmeval_en_SSR_L5: 0.85 followbench_llmeval_en_SSR_L3: 0.87
simpleqa_f1: 0.04 followbench_llmeval_en_SSR_L4: 0.87
followbench_llmeval_en_SSR_L5: 0.85
simpleqa_f1: 0.04
internlm2_5-7b-chat-1m-turbomind: internlm2_5-7b-chat-1m-turbomind:
ruler_8k_naive_average: 88.53 long_context:
ruler_32k_naive_average: 83.84 ruler_8k_naive_average: 88.53
ruler_128k_naive_average: 70.94 ruler_32k_naive_average: 83.84
NeedleBench-Overall-Score-8K_weighted_average: 91.89 ruler_128k_naive_average: 70.94
NeedleBench-Overall-Score-32K_weighted_average: 91.42 NeedleBench-Overall-Score-8K_weighted_average: 91.89
NeedleBench-Overall-Score-128K_weighted_average: 88.57 NeedleBench-Overall-Score-32K_weighted_average: 91.42
longbench_naive_average: 46.44 NeedleBench-Overall-Score-128K_weighted_average: 88.57
longbench_zh_naive_average: 45.19 longbench_naive_average: 46.44
longbench_en_naive_average: 45.71 longbench_zh_naive_average: 45.19
babilong_0k_naive_average: 79.3 longbench_en_naive_average: 45.71
babilong_4k_naive_average: 67 babilong_0k_naive_average: 79.3
babilong_16k_naive_average: 52.7 babilong_4k_naive_average: 67
babilong_32k_naive_average: 48.9 babilong_16k_naive_average: 52.7
babilong_128k_naive_average: 40.8 babilong_32k_naive_average: 48.9
babilong_256k_naive_average: 23.5 babilong_128k_naive_average: 40.8
longbench_single-document-qa_naive_average: 43.56 babilong_256k_naive_average: 23.5
longbench_multi-document-qa_naive_average: 46.24 longbench_single-document-qa_naive_average: 43.56
longbench_summarization_naive_average: 24.32 longbench_multi-document-qa_naive_average: 46.24
longbench_few-shot-learning_naive_average: 51.67 longbench_summarization_naive_average: 24.32
longbench_synthetic-tasks_naive_average: 66.83 longbench_few-shot-learning_naive_average: 51.67
longbench_code-completion_naive_average: 45.99 longbench_synthetic-tasks_naive_average: 66.83
longbench_code-completion_naive_average: 45.99
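The fullbench baselines above now group every metric under a category key (objective, subjective, long_context), so reading a score is a two-level dict lookup rather than a flat one. A minimal sketch of that lookup with PyYAML; the file path is an assumption based on the keys shown here:

import yaml  # PyYAML

def load_baseline(path):
    # Parse the baseline into {model: {category: {metric: value}}}.
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)

baseline = load_baseline('.github/scripts/oc_score_baseline_fullbench.yaml')
# Model first, then category, then metric.
print(baseline['internlm2_5-7b-turbomind']['objective']['gsm8k_accuracy'])  # 74.68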

View File

@@ -1,459 +1,468 @@
chat:
  glm-4-9b-chat-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 90.62
  glm-4-9b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 90.62
  glm-4-9b-chat-vllm:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 90.62
  deepseek-7b-chat-hf:
    gsm8k_accuracy: 46.88
    race-high_accuracy: 81.25
  deepseek-moe-16b-chat-hf:
    gsm8k_accuracy: 50
    race-high_accuracy: 68.75
  deepseek-7b-chat-vllm:
    gsm8k_accuracy: 43.75
    race-high_accuracy: 75
  gemma2-2b-it-hf:
    gsm8k_accuracy: 50
    race-high_accuracy: 71.88
  gemma2-9b-it-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 84.38
  gemma-2b-it-hf:
    gsm8k_accuracy: 3.12
    race-high_accuracy: 40.62
  gemma-7b-it-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 68.75
  gemma-2-9b-it-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 84.38
  gemma-2-27b-it-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 93.75
  gemma-7b-it-vllm:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 68.75
  internlm2_5-7b-chat-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  internlm2_5-7b-chat-turbomind:
    gsm8k_accuracy: 87.50
    race-high_accuracy: 90.62
  internlm2-chat-1.8b-turbomind:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 84.38
  internlm2-chat-1.8b-sft-turbomind:
    gsm8k_accuracy: 21.88
    race-high_accuracy: 84.38
  internlm2-chat-7b-lmdeploy:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 84.38
  internlm2-chat-7b-sft-turbomind:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 90.62
  internlm2-chat-7b-vllm:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 84.38
  llama-3_1-8b-instruct-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  llama-3_2-3b-instruct-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 81.25
  llama-3-8b-instruct-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 87.5
  llama-2-7b-chat-turbomind:
    gsm8k_accuracy: 18.75
    race-high_accuracy: 46.88
  llama-3_1-8b-instruct-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 90.62
  llama-3_2-3b-instruct-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 81.25
  llama-3-8b-instruct-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 87.5
  mistral-7b-instruct-v0.2-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 75
  mistral-7b-instruct-v0.3-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 75
  mistral-nemo-instruct-2407-hf:
    gsm8k_accuracy: 75
    race-high_accuracy: 81.25
  mistral-nemo-instruct-2407-turbomind:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 87.50
  mistral-7b-instruct-v0.1-vllm:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 68.75
  mistral-7b-instruct-v0.2-vllm:
    gsm8k_accuracy: 43.75
    race-high_accuracy: 75
  MiniCPM3-4B-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 84.38
  phi-3-mini-4k-instruct-hf:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 84.38
  phi-3-small-8k-instruct-hf:
    gsm8k_accuracy: 0
    race-high_accuracy: 0
  qwen2.5-0.5b-instruct-hf:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 46.88
  qwen2.5-3b-instruct-hf:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 90.62
  qwen2.5-0.5b-instruct-turbomind:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 50
  qwen2.5-3b-instruct-turbomind:
    gsm8k_accuracy: 59.38
    race-high_accuracy: 90.62
  qwen1.5-0.5b-chat-hf:
    gsm8k_accuracy: 0
    race-high_accuracy: 53.12
  qwen2-1.5b-instruct-hf:
    gsm8k_accuracy: 62.5
    race-high_accuracy: 84.38
  qwen2-7b-instruct-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 90.62
  qwen2-1.5b-instruct-turbomind:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 84.38
  qwen2-7b-instruct-turbomind:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 90.62
  qwen1.5-0.5b-chat-vllm:
    gsm8k_accuracy: 3.12
    race-high_accuracy: 53.12
  yi-1.5-6b-chat-hf:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 84.38
  yi-1.5-9b-chat-hf:
    gsm8k_accuracy: 75
    race-high_accuracy: 93.75
  yi-1.5-6b-chat-turbomind:
    gsm8k_accuracy: 62.5
    race-high_accuracy: 84.38
  yi-1.5-9b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 93.75
  deepseek-v2-lite-chat-hf:
    gsm8k_accuracy: 46.88
    race-high_accuracy: 71.88
  gemma2-27b-it-hf:
    gsm8k_accuracy: 75
    race-high_accuracy: 93.75
  internlm2_5-20b-chat-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 87.5
  internlm2_5-20b-chat-turbomind:
    gsm8k_accuracy: 87.50
    race-high_accuracy: 87.5
  mistral-small-instruct-2409-hf:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 87.50
  mistral-small-instruct-2409-turbomind:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 87.50
  qwen2.5-14b-instruct-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 96.88
  qwen2.5-14b-instruct-turbomind:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 93.75
  yi-1.5-34b-chat-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 93.75
  deepseek-67b-chat-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 78.12
  llama-3_3-70b-instruct-turbomind:
    gsm8k_accuracy: 93.75
    race-high_accuracy: 87.5
  mixtral-8x7b-instruct-v0.1-hf:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 81.25
  mixtral-large-instruct-2411-turbomind:
    gsm8k_accuracy: 90.62
    race-high_accuracy: 93.75
  nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
    gsm8k_accuracy: 87.5
    race-high_accuracy: 46.88
  qwen2.5-72b-instruct-turbomind:
    gsm8k_accuracy: 75
    race-high_accuracy: 93.75
  deepseek-v2_5-1210-turbomind:
    gsm8k_accuracy: 90.62
    race-high_accuracy: 84.38
  mixtral-8x22b-instruct-v0.1-hf:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 81.25
base:
  glm-4-9b-hf:
    gsm8k_accuracy: 68.75
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  glm-4-9b-turbomind:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 28.12
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  deepseek-7b-base-hf:
    gsm8k_accuracy: 25
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 46.88
    winogrande_accuracy: 71.88
  deepseek-moe-16b-base-hf:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 21.88
    winogrande_accuracy: 65.62
  deepseek-7b-base-turbomind:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 46.88
    winogrande_accuracy: 84.38
  deepseek-moe-16b-base-vllm:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 25
    winogrande_accuracy: 68.75
  gemma2-2b-hf:
    gsm8k_accuracy: 28.12
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 56.25
    winogrande_accuracy: 71.88
  gemma2-9b-hf:
    gsm8k_accuracy: 68.75
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 81.25
    winogrande_accuracy: 84.38
  gemma-2b-hf:
    gsm8k_accuracy: 18.75
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 25
    winogrande_accuracy: 53.12
  gemma-7b-hf:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 65.62
    winogrande_accuracy: 78.12
  gemma-2b-vllm:
    gsm8k_accuracy: 15.62
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy:
    winogrande_accuracy:
  gemma-7b-vllm:
    gsm8k_accuracy: 53.12
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy:
    winogrande_accuracy:
  internlm2_5-7b-hf:
    gsm8k_accuracy: 37.5
    GPQA_diamond_accuracy: 25
    race-high_accuracy: 93.75
    winogrande_accuracy: 71.88
  internlm2-7b-hf:
    gsm8k_accuracy: 53.12
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 62.5
    winogrande_accuracy: 78.12
  internlm2-base-7b-hf:
    gsm8k_accuracy: 3.12
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 75
    winogrande_accuracy: 65.62
  internlm2-1.8b-turbomind:
    gsm8k_accuracy: 12.5
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 71.88
    winogrande_accuracy: 78.12
  internlm2_5-7b-turbomind:
    gsm8k_accuracy: 62.50
    GPQA_diamond_accuracy: 34.38
    race-high_accuracy: 93.75
    winogrande_accuracy: 87.50
  internlm2-7b-turbomind:
    gsm8k_accuracy: 53.12
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 71.88
    winogrande_accuracy: 84.38
  internlm2-base-7b-turbomind:
    gsm8k_accuracy: 37.50
    GPQA_diamond_accuracy: 28.12
    race-high_accuracy: 81.25
    winogrande_accuracy: 75
  llama-2-7b-hf:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 40.62
    winogrande_accuracy: 71.88
  llama-3_1-8b-hf:
    gsm8k_accuracy: 78.12
    GPQA_diamond_accuracy: 25
    race-high_accuracy: 90.62
    winogrande_accuracy: 62.5
  llama-3-8b-hf:
    gsm8k_accuracy: 46.88
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 65.62
    winogrande_accuracy: 65.62
  llama-3.1-8b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 78.12
    winogrande_accuracy: 78.12
  llama-3-8b-turbomind:
    gsm8k_accuracy: 50
    GPQA_diamond_accuracy: 12.50
    race-high_accuracy: 65.62
    winogrande_accuracy: 78.12
  mistral-7b-v0.2-hf:
    gsm8k_accuracy: 31.25
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 62.5
    winogrande_accuracy: 59.38
  mistral-7b-v0.3-hf:
    gsm8k_accuracy: 31.25
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 62.5
    winogrande_accuracy: 59.38
  mistral-7b-v0.2-vllm:
    gsm8k_accuracy: 34.38
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 62.5
    winogrande_accuracy: 65.62
  qwen2.5-7b-hf:
    gsm8k_accuracy: 81.25
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 87.5
    winogrande_accuracy: 71.88
  qwen2.5-1.5b-turbomind:
    gsm8k_accuracy: 62.50
    GPQA_diamond_accuracy: 12.50
    race-high_accuracy: 78.12
    winogrande_accuracy: 68.75
  qwen2.5-7b-turbomind:
    gsm8k_accuracy: 75.00
    GPQA_diamond_accuracy: 25
    race-high_accuracy: 87.5
    winogrande_accuracy: 71.88
  qwen1.5-moe-a2.7b-hf:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 84.38
    winogrande_accuracy: 75
  qwen2-0.5b-hf:
    gsm8k_accuracy: 25
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 40.62
    winogrande_accuracy: 62.5
  qwen2-1.5b-hf:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 81.25
    winogrande_accuracy: 62.5
  qwen2-7b-hf:
    gsm8k_accuracy: 68.75
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 87.5
    winogrande_accuracy: 68.75
  qwen2-1.5b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 81.25
    winogrande_accuracy: 75
  qwen2-7b-turbomind:
    gsm8k_accuracy: 75.00
    GPQA_diamond_accuracy: 12.5
    race-high_accuracy: 87.5
    winogrande_accuracy: 71.88
  qwen1.5-0.5b-vllm:
    gsm8k_accuracy: 9.38
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 56.25
    winogrande_accuracy: 62.5
  yi-1.5-6b-hf:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 87.5
    winogrande_accuracy: 62.5
  yi-1.5-9b-hf:
    gsm8k_accuracy: 75
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 87.5
    winogrande_accuracy: 59.38
  yi-1.5-9b-turbomind:
    gsm8k_accuracy: 78.12
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 87.5
    winogrande_accuracy: 71.88
  deepseek-v2-lite-hf:
    gsm8k_accuracy: 31.25
    GPQA_diamond_accuracy: 28.12
    race-high_accuracy: 59.38
    winogrande_accuracy: 71.88
  internlm2-20b-hf:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 15.62
    race-high_accuracy: 68.75
    winogrande_accuracy: 75
  internlm2-base-20b-hf:
    gsm8k_accuracy: 12.5
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 84.38
    winogrande_accuracy: 65.62
  internlm2-20b-turbomind:
    gsm8k_accuracy: 71.88
    GPQA_diamond_accuracy: 15.62
    race-high_accuracy: 68.75
    winogrande_accuracy: 81.25
  qwen2.5-14b-hf:
    gsm8k_accuracy: 75
    GPQA_diamond_accuracy: 37.5
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  qwen2.5-32b-hf:
    gsm8k_accuracy: 87.5
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 78.12
  qwen2.5-32b-turbomind:
    gsm8k_accuracy: 84.38
    GPQA_diamond_accuracy: 28.12
    race-high_accuracy: 93.75
    winogrande_accuracy: 81.25
  deepseek-67b-base-hf:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 81.25
    winogrande_accuracy: 90.62
  deepseek-67b-base-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 28.12
    race-high_accuracy: 81.25
    winogrande_accuracy: 84.38
  llama-3-70b-turbomind:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  qwen2.5-72b-turbomind:
    gsm8k_accuracy: 84.38
    GPQA_diamond_accuracy: 34.38
    race-high_accuracy: 93.75
    winogrande_accuracy: 87.5
  deepseek-v2-turbomind:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 81.25
    winogrande_accuracy: 68.75
  llama-3-70b-hf:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38

View File

@@ -28,21 +28,21 @@ on:
        description: 'Set branch or tag or commit id. Default is "main"'
        type: string
        default: 'main'
      regression_func_volc:
        required: true
        description: 'regression functions'
        type: string
        default: "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']"
      regression_func_local:
        required: true
        description: 'regression functions'
        type: string
        default: "['cmd', 'api', 'chat_sub_fullbench']"
      fullbench_eval:
        required: true
        description: 'fullbench volc functions'
        type: string
        default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
  schedule:
    - cron: '15 14 * * *'
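Splitting the old single regression_func input into regression_func_volc and regression_func_local means a manual dispatch can target either runner pool on its own. A hedged sketch of such a dispatch through the GitHub REST API; the repo slug and token handling are illustrative, while the workflow file name daily-run-test.yml matches this commit:

import os

import requests

# POST /repos/{owner}/{repo}/actions/workflows/{workflow_file}/dispatches
url = ('https://api.github.com/repos/open-compass/opencompass'
       '/actions/workflows/daily-run-test.yml/dispatches')
resp = requests.post(
    url,
    json={
        'ref': 'main',
        'inputs': {
            # Run only two volc suites; leave the local suites at their default.
            'regression_func_volc': "['chat_models','base_models']",
        },
    },
    headers={
        'Authorization': f'Bearer {os.environ["GITHUB_TOKEN"]}',
        'Accept': 'application/vnd.github+json',
    },
    timeout=30,
)
resp.raise_for_status()  # GitHub answers 204 No Content on success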
@@ -54,6 +54,13 @@ env:
  LMDEPLOY_USE_MODELSCOPE: false
  HF_HUB_OFFLINE: 1
  OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
  CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
  PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
  REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
  COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
  HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
  HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
  CONDA_ENV: regression_test

jobs:
  build-pypi:
@@ -117,14 +124,7 @@ jobs:
  prepare_env:
    if: ${{!cancelled()}}
    needs: ['build-pypi', 'build-pypi-lmdeploy']
    runs-on: volc_cu12
    environment: 'prod'
    timeout-minutes: 240 #4hours
    steps:
@@ -140,79 +140,52 @@
      - name: Remove Conda Env
        if: always()
        run: |
          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
          conda env remove -y --name ${{env.CONDA_ENV}}
          conda info --envs
      - name: Prepare - create conda env and install torch - cu12
        uses: nick-fields/retry@v3
        with:
          max_attempts: 3
          timeout_minutes: 240
          command: |
            . ${{env.CONDA_PATH}}/bin/activate
            conda create -y --name ${{env.CONDA_ENV}} python=3.10
            conda activate ${{env.CONDA_ENV}}
            pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
            pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
            pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
            cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
      - name: Prepare - reinstall lmdeploy - cu12
        if: ${{inputs.build_lmdeploy}}
        uses: actions/download-artifact@v4
        with:
          name: my-artifact-${{ github.run_id }}-py310
      - name: Prepare - reinstall lmdeploy - cu12
        if: ${{inputs.build_lmdeploy}}
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          pip install lmdeploy-*.whl --no-deps
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list

  daily_run_test_volc:
    if: ${{!cancelled()}}
    needs: prepare_env
    strategy:
      fail-fast: false
      matrix:
        regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
    runs-on: volc_cu12_daily
    environment: 'prod'
    timeout-minutes: 240 #4hours
    steps:
@@ -221,105 +194,114 @@
        with:
          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list
      - name: modify config
        if: matrix.regression_func != 'chat_sub_fullbench'
        run: |
          cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
          cat /fs-computility/llm/qa-llm-cicd/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
      - name: Run test
        uses: nick-fields/retry@v3
        with:
          max_attempts: 3
          timeout_minutes: 40
          command: |
            . ${{env.CONDA_PATH}}/bin/activate
            conda activate ${{env.CONDA_ENV}}
            conda info --envs
            opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
            python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py

  daily_run_test_local:
    if: ${{!cancelled()}}
    needs: prepare_env
    strategy:
      fail-fast: false
      matrix:
        regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
    runs-on: volc_cu12_local
    environment: 'prod'
    timeout-minutes: 240 #4hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
        with:
          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: conda env
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          pip list
      - name: modify config
        if: matrix.regression_func == 'chat_sub_fullbench'
        run: |
          cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
          cat /fs-computility/llm/qa-llm-cicd/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
      - name: Run command testcase
        if: matrix.regression_func == 'cmd'
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          export from_tf=TRUE
          python tools/list_configs.py internlm2_5 mmlu
          opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
          python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
          python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
          python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
          python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
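Each case above symlinks the run's summary directory to regression_result_daily before pytest runs one marker from oc_score_assert.py. A rough sketch of the assertion pattern such a script could use, reading the summary CSV and comparing against a baseline within a tolerance; the CSV column names and the fixed tolerance are assumptions, not the actual oc_score_assert.py logic:

import csv
import glob

import yaml

def assert_model_scores(baseline_path, model, tolerance=5.0):
    # Baseline: {model: {metric_key: expected_value}}.
    with open(baseline_path, encoding='utf-8') as f:
        expected = yaml.safe_load(f)[model]
    # The workflow step linked <work-dir>/*/summary here.
    [summary_csv] = glob.glob('regression_result_daily/summary_*.csv')
    with open(summary_csv, encoding='utf-8') as f:
        for row in csv.DictReader(f):
            key = f"{row['dataset']}_{row['metric']}"
            if key in expected and row[model] != '-':
                got, want = float(row[model]), float(expected[key])
                assert abs(got - want) <= tolerance, (
                    f'{key}: got {got}, baseline {want}')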
-      - name: Run chat model test
-        if: matrix.regression_func == 'chat_models'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name: Run base model test
-        if: matrix.regression_func == 'base_models'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name: Run chat model test - fullbench
-        if: matrix.regression_func == 'chat_obj_fullbench'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name: Run chat model test - fullbench
-        if: matrix.regression_func == 'chat_sub_fullbench'
-        env:
-          COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name: Run base model test - fullbench
-        if: matrix.regression_func == 'base_fullbench'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
       - name: Run model test - api
         if: matrix.regression_func == 'api'
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           conda info --envs
           lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 120s
-          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
           python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
       - name: Run model test - api kill
         if: always() && matrix.regression_func == 'api'
         run: |
           kill -15 "$restful_pid"
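The api pair of steps uses a standard Actions idiom for a background service: capture the PID of the last background process with `$!`, persist it via `$GITHUB_ENV` so it survives into later steps, and terminate it in a step guarded by `if: always()` so cleanup runs even after a failure. A sketch under those assumptions; the server command and log path are placeholders, not from this workflow:

    - name: Start background server (sketch)
      run: |
        my_server > server.log 2>&1 &
        # $! expands to the PID of the command just sent to the background
        echo "server_pid=$!" >> "$GITHUB_ENV"
    - name: Stop background server (sketch)
      if: always()   # run even when earlier steps failed
      run: kill -15 "$server_pid"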
+      - name: Run testcase
+        if: matrix.regression_func == 'chat_sub_fullbench'
+        env:
+          COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache_subset
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          export from_tf=TRUE
+          opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
+          python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
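The step-level `env:` in the added step is the interesting part: COMPASS_DATA_CACHE is redirected to a subset cache for this step only, while every other step keeps the default set higher up. A sketch of that scoping rule; all job names and paths here are illustrative:

    env:
      COMPASS_DATA_CACHE: /data/full_cache            # workflow-level default
    jobs:
      demo:
        runs-on: ubuntu-latest
        steps:
          - name: subset run
            env:
              COMPASS_DATA_CACHE: /data/subset_cache  # wins for this step only
            run: echo "$COMPASS_DATA_CACHE"           # prints /data/subset_cache
          - name: default run
            run: echo "$COMPASS_DATA_CACHE"           # prints /data/full_cache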
   fullbench_run_test:
     if: ${{!cancelled()}}
-    needs: ['build-pypi', 'build-pypi-lmdeploy']
-    env:
-      FULLBENCH_CONDA_ENV: regression_test
-      FULLBENCH_REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
-      COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
+    needs: prepare_env
     strategy:
       fail-fast: false
       matrix:
-        function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_long_context","base_objective","chat_long_context","chat_objective","chat_subjective"]')}}
+        function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
     runs-on: volc_cu12
     environment: 'prod'
     timeout-minutes: 360 #6hours
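The matrix line relies on `fromJSON` with a `||` fallback: a manual workflow_dispatch run can supply its own JSON list of suites, while scheduled runs parse the hard-coded literal. A trimmed sketch of the mechanism; the suite values are shortened and the job is a stand-in:

    on:
      workflow_dispatch:
        inputs:
          fullbench_eval:
            required: false   # JSON array string, e.g. '["base_objective"]'
    jobs:
      eval:
        runs-on: ubuntu-latest
        strategy:
          matrix:
            # manual input wins; otherwise the literal list is parsed
            function_type: ${{ fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective"]') }}
        steps:
          - run: echo "running ${{ matrix.function_type }}"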
@@ -329,48 +311,30 @@ jobs:
         with:
           repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
           ref: ${{github.event.inputs.repo_ref || 'main'}}
-      - name: Download Artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: my-artifact-${{ github.run_id }}
-      - name: Prepare - reinstall opencompass - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
-        run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
-          pip install opencompass*.whl --no-deps
-      - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
-        uses: actions/download-artifact@v4
-        with:
-          name: my-artifact-${{ github.run_id }}-py310
-      - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
-        run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
-          pip install lmdeploy-*.whl --no-deps
-      - name: Conda env
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
-        run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
+      - name: conda env
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           conda info --envs
           pip list
-      - name: Run command testcase
-        run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
-          conda info --envs
-          export from_tf=TRUE
-          opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
-          rm regression_result_daily -f && ln -s ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
-          python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run testcase
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3
+          timeout_minutes: 240
+          command: |
+            . ${{env.CONDA_PATH}}/bin/activate
+            conda activate ${{env.CONDA_ENV}}
+            conda info --envs
+            export from_tf=TRUE
+            opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
+            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
+            python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
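Swapping the plain `run:` for nick-fields/retry@v3 makes long, flaky evaluations retryable: each attempt starts a fresh shell, which is why the conda activation is repeated inside `command`, and `timeout_minutes` bounds a single attempt while the job-level `timeout-minutes: 360` still caps the whole job. A minimal sketch of the action's shape; the command is a placeholder:

    - name: Retry a flaky command (sketch)
      uses: nick-fields/retry@v3
      with:
        max_attempts: 3      # total tries, including the first
        timeout_minutes: 10  # per-attempt limit
        command: |
          ./flaky_eval.sh    # re-executed from scratch on every attempt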
   notify_to_feishu:
     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
-    needs: [daily_run_test, fullbench_run_test]
+    needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
     environment: 'prod'
     timeout-minutes: 5
     runs-on: self-hosted
View File

@@ -18,18 +18,23 @@ concurrency:
   cancel-in-progress: true
 env:
-  CONDA_ENV: opencompass_
-  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
-  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+  CONDA_ENV: pr_test
   HF_DATASETS_OFFLINE: 1
-  HF_EVALUATE_OFFLINE: 1
   TRANSFORMERS_OFFLINE: 1
-  HF_HUB_OFFLINE: 1
   VLLM_USE_MODELSCOPE: false
   LMDEPLOY_USE_MODELSCOPE: false
+  HF_HUB_OFFLINE: 1
+  CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
+  PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
+  REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest
+  COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
+  HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
+  HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
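The rebuilt env block does two things at once: it pins every cache (pip, datasets, HF hub) to the shared /fs-computility volume, and it forces the Hugging Face stack offline so a model or dataset missing from the cache fails fast instead of attempting a download. A sketch of how offline mode pairs with a pinned cache; the cache path and check command are illustrative:

    jobs:
      offline_check:
        runs-on: ubuntu-latest
        env:
          HF_HUB_OFFLINE: 1                    # never contact the Hub
          HF_HUB_CACHE: /shared/models/hf_hub  # resolve from this cache only
        steps:
          - run: |
              # succeeds only if the snapshot is already cached; otherwise raises
              python -c "from huggingface_hub import snapshot_download; snapshot_download('internlm/internlm2_5-7b-chat')"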
 jobs:
   pr_run_test:
-    runs-on: dsw_cu12
+    runs-on: volc_cu12_local
     environment: 'prod'
     timeout-minutes: 30
     steps:
@@ -37,54 +42,55 @@ jobs:
         uses: actions/checkout@v2
       - name: Prepare - Install opencompass
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
-          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
+          python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
-      - name: Prepare - prepare data and hf model
+      - name: conda env
         run: |
-          cp -r ${{env.USERSPACE_PREFIX}}/data .
-          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+          lmdeploy check_env
       - name: Run test
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           conda info --envs
           rm -rf regression_result
-          opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result1 --debug
-          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result2 --debug --max-num-workers 2
-          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir regression_result3 --debug --max-num-workers 2
+          opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
+          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
+          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
       - name: Get result
         run: |
-          score=$(sed -n '$p' regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
           if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
             echo "score is $score between 88 and 89"
           else
             echo "score is $score not between 88 and 89"
             exit 1
           fi
-          score=$(sed -n '$p' regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
           if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
             echo "score is $score between 87 and 88"
           else
             echo "score is $score not between 87 and 88"
             exit 1
           fi
-          score=$(sed -n '$p' regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
           if (( ${score%.*} >= 87 && ${score%.*} <= 89 )); then
             echo "score is $score between 87 and 89"
           else
             echo "score is $score not between 87 and 89"
             exit 1
           fi
-          rm -rf regression_result1 & rm -rf regression_result2 & rm -rf regression_result3
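The Get result checks are plain bash: `sed -n '$p'` keeps only the last CSV row, `awk -F ',' '{print $NF}'` takes its final column, and `${score%.*}` truncates the decimal part so the `(( ... ))` arithmetic comparison operates on an integer. The same logic, factored into a hypothetical helper for readability; the function name, path, and bounds are illustrative:

    - name: Get result (sketch)
      run: |
        # assert_between CSV LOW HIGH -- exits 1 when the score leaves the range
        assert_between() {
          local score
          score=$(sed -n '$p' "$1" | awk -F ',' '{print $NF}')  # last row, last field
          if (( ${score%.*} >= $2 && ${score%.*} <= $3 )); then
            echo "score is $score between $2 and $3"
          else
            echo "score is $score not between $2 and $3"
            exit 1
          fi
        }
        assert_between work_dir/summary/scores.csv 87 89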
       - name: Uninstall opencompass
         if: always()
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
           conda info --envs