From c48bbde26fc98a0876a3450fe8655f6a0e7f2faf Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 25 Dec 2024 17:26:50 +0800 Subject: [PATCH] [ci] remove testcase into volc engine (#1777) * update * update * update * update * update * update * updaste * update * update * update * update * update * update * update * updaste * update * update * update * update * update * update * update * update * update * Update daily-run-test.yml * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update --- .../scripts/eval_regression_base_fullbench.py | 2 + ...base.py => eval_regression_base_models.py} | 26 + ...chat.py => eval_regression_chat_models.py} | 45 +- ... => eval_regression_chat_obj_fullbench.py} | 2 + ... => eval_regression_chat_sub_fullbench.py} | 29 +- .github/scripts/oc_score_assert.py | 382 ++------ .github/scripts/oc_score_baseline.yaml | 4 +- .../scripts/oc_score_baseline_fullbench.yaml | 877 +++++++++-------- .../scripts/oc_score_baseline_testrange.yaml | 927 +++++++++--------- .github/workflows/daily-run-test.yml | 302 +++--- .github/workflows/pr-run-test.yml | 52 +- 11 files changed, 1226 insertions(+), 1422 deletions(-) rename .github/scripts/{eval_regression_base.py => eval_regression_base_models.py} (80%) rename .github/scripts/{eval_regression_chat.py => eval_regression_chat_models.py} (76%) rename .github/scripts/{eval_regression_chat_objective_fullbench.py => eval_regression_chat_obj_fullbench.py} (99%) rename .github/scripts/{eval_regression_chat_subjective_fullbench.py => eval_regression_chat_sub_fullbench.py} (86%) diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py index 23a8505b..028a41b6 100644 --- a/.github/scripts/eval_regression_base_fullbench.py +++ b/.github/scripts/eval_regression_base_fullbench.py @@ -66,6 +66,8 @@ with read_base(): from opencompass.configs.summarizers.groups.mmlu_pro import \ mmlu_pro_summary_groups # noqa: F401, E501 + from ...volc import infer as volc_infer # noqa: F401, E501 + race_datasets = [race_datasets[1]] # Only take RACE-High humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2' bbh_datasets = [ diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base_models.py similarity index 80% rename from .github/scripts/eval_regression_base.py rename to .github/scripts/eval_regression_base_models.py index 330c97e5..08413707 100644 --- a/.github/scripts/eval_regression_base.py +++ b/.github/scripts/eval_regression_base_models.py @@ -13,12 +13,22 @@ with read_base(): # read hf models - chat models from opencompass.configs.models.chatglm.hf_glm4_9b import \ models as hf_glm4_9b_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \ + models as lmdeploy_glm4_9b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \ + models as hf_deepseek_7b_base_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \ + models as hf_deepseek_67b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ models as hf_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \ + models as lmdeploy_deepseek_67b_base_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import \ + lmdeploy_deepseek_v2_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \ models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_2b import \ @@ -29,6 +39,8 @@ with read_base(): models as hf_gemma_2b_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma_7b import \ models as hf_gemma_7b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.lmdeploy_gemma_9b import \ + models as lmdeploy_gemma_9b_model # noqa: F401, E501 from opencompass.configs.models.gemma.vllm_gemma_2b import \ models as vllm_gemma_2b_model # noqa: F401, E501 from opencompass.configs.models.gemma.vllm_gemma_7b import \ @@ -59,10 +71,14 @@ with read_base(): models as hf_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b import \ models as hf_llama3_8b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_70b import \ + models as hf_llama3_70b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ models as lmdeploy_llama3_8b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \ + models as lmdeploy_llama3_70b_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \ models as hf_mistral_7b_v0_2_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \ @@ -73,10 +89,16 @@ with read_base(): models as hf_qwen_2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \ models as hf_qwen_2_5_14b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen_2_5_32b import \ + models as hf_qwen_2_5_32b_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \ models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \ models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \ + models as lmdeploy_qwen2_5_32b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \ + models as lmdeploy_qwen2_5_72b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \ models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_0_5b import \ @@ -95,6 +117,10 @@ with read_base(): models as hf_yi_1_5_6b_model # noqa: F401, E501 from opencompass.configs.models.yi.hf_yi_1_5_9b import \ models as hf_yi_1_5_9b_model # noqa: F401, E501 + from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \ + models as lmdeploy_yi_1_5_9b_model # noqa: F401, E501 + + from ...volc import infer as volc_infer # noqa: F401, E501 race_datasets = [race_datasets[1]] models = sum([v for k, v in locals().items() if k.endswith('_model')], []) diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat_models.py similarity index 76% rename from .github/scripts/eval_regression_chat.py rename to .github/scripts/eval_regression_chat_models.py index 7762e4f7..eeade13f 100644 --- a/.github/scripts/eval_regression_chat.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -7,8 +7,6 @@ with read_base(): from opencompass.configs.datasets.race.race_gen import \ race_datasets # noqa: F401, E501 # read hf models - chat models - from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \ - models as hf_baichuan2_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \ models as hf_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \ @@ -17,22 +15,30 @@ with read_base(): models as vllm_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \ models as hf_deepseek_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \ + models as hf_deepseek_67b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \ models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \ models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \ + models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \ models as vllm_deepseek_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_2b_it import \ models as hf_gemma2_2b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_9b_it import \ models as hf_gemma2_9b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_27b_it import \ + models as hf_gemma2_27b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma_2b_it import \ models as hf_gemma_2b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma_7b_it import \ models as hf_gemma_7b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \ models as lmdeploy_gemma_9b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \ + models as lmdeploy_gemma_27b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.vllm_gemma_7b_it import \ models as vllm_gemma_7b_it_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ @@ -65,6 +71,8 @@ with read_base(): models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \ models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \ + models as lmdeploy_llama3_3_70b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \ @@ -75,6 +83,13 @@ with read_base(): models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ + models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \ + models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \ + models as \ + lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501 from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \ models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501 from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ @@ -84,22 +99,28 @@ with read_base(): models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \ + models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm3_4b import \ models as hf_minicpm3_4b_model # noqa: F401, E501 - from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \ - models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501 - from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \ - models as hf_minicpm_2b_sft_bf16_model # noqa: F401, E501 - from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \ - models as hf_minicpm_2b_sft_fp32_model # noqa: F401, E501 from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \ models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501 from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \ models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \ + models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \ + models as hf_qwen2_5_3b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \ models as hf_qwen2_5_14b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \ + models as lmdeploy_qwen2_5_0_5b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \ + models as lmdeploy_qwen2_5_3b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \ models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \ + models as lmdeploy_qwen2_5_72b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \ models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \ @@ -116,6 +137,14 @@ with read_base(): models as hf_yi_1_5_6b_chat_model # noqa: F401, E501 from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \ models as hf_yi_1_5_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \ + models as lmdeploy_yi_1_5_6b_chat_model # noqa: F401, E501 + from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \ + models as lmdeploy_yi_1_5_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \ + models as lmdeploy_yi_1_5_34b_chat_model # noqa: F401, E501 + + from ...volc import infer as volc_infer # noqa: F401, E501 race_datasets = [race_datasets[1]] datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_obj_fullbench.py similarity index 99% rename from .github/scripts/eval_regression_chat_objective_fullbench.py rename to .github/scripts/eval_regression_chat_obj_fullbench.py index f02fb7c4..98588dee 100644 --- a/.github/scripts/eval_regression_chat_objective_fullbench.py +++ b/.github/scripts/eval_regression_chat_obj_fullbench.py @@ -107,6 +107,8 @@ with read_base(): from opencompass.configs.summarizers.mmmlu_lite import \ mmmlu_summary_groups # noqa: F401, E501 + from ...volc import infer as volc_infer # noqa: F401, E501 + # For HumanEval-X Evaluation # Apply the evaluator ip_address and port race_datasets = [race_datasets[1]] diff --git a/.github/scripts/eval_regression_chat_subjective_fullbench.py b/.github/scripts/eval_regression_chat_sub_fullbench.py similarity index 86% rename from .github/scripts/eval_regression_chat_subjective_fullbench.py rename to .github/scripts/eval_regression_chat_sub_fullbench.py index 60495f22..6ef87752 100644 --- a/.github/scripts/eval_regression_chat_subjective_fullbench.py +++ b/.github/scripts/eval_regression_chat_sub_fullbench.py @@ -22,8 +22,7 @@ with read_base(): arenahard_datasets # noqa: F401, E501 from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \ compassarena_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import \ - fofo_datasets # noqa: F401, E501 + # from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets # noqa: F401, E501 from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \ followbench_llmeval_datasets # noqa: F401, E501 from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \ @@ -35,6 +34,8 @@ with read_base(): from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + from ...volc import infer as volc_infer # noqa: F401, E501 + datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and 'mtbench101' not in k and 'wildbench' not in k), []) datasets += mtbench101_datasets # noqa: F401, E501 @@ -73,25 +74,15 @@ eval = dict( summary_groups = [] summary_groups.append({ - 'name': - 'compassarena_language', + 'name': 'compassarena_language', 'subsets': [ ['compassarena_language', '内容总结'], - ['compassarena_language', '情感分析'], - ['compassarena_language', 'Information Retrival'], - ['compassarena_language', '综合问答'], - ['compassarena_language', '中华文化'], ], }) summary_groups.append({ - 'name': - 'compassarena_knowledge', + 'name': 'compassarena_knowledge', 'subsets': [ ['compassarena_knowledge', '生活常识_ZH'], - ['compassarena_knowledge', '自然科学工科_ZH'], - ['compassarena_knowledge', '人文科学_ZH'], - ['compassarena_knowledge', '自然科学理科_ZH'], - ['compassarena_knowledge', '社会科学_ZH'], ], }) summary_groups.append({ @@ -101,21 +92,15 @@ summary_groups.append({ ], }) summary_groups.append({ - 'name': - 'compassarena_math_v2', + 'name': 'compassarena_math_v2', 'subsets': [ ['compassarena_math_v2', '高等数学_ZH'], - ['compassarena_math_v2', '初等数学_ZH'], - ['compassarena_math_v2', '中等数学_ZH'], ], }) summary_groups.append({ - 'name': - 'compassarena_creationv2_zh', + 'name': 'compassarena_creationv2_zh', 'subsets': [ ['compassarena_creationv2_zh', '内容扩写_ZH'], - ['compassarena_creationv2_zh', '内容续写_ZH'], - ['compassarena_creationv2_zh', '内容改写_ZH'], ], }) summary_groups.append({ diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index d53c5bf5..6ad6e295 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -6,74 +6,19 @@ import yaml output_path = 'regression_result_daily' -chat_model_list = [ - 'baichuan2-7b-chat-hf', - 'glm-4-9b-chat-hf', - 'glm-4-9b-chat-turbomind', - 'glm-4-9b-chat-vllm', - 'deepseek-7b-chat-hf', - 'deepseek-moe-16b-chat-hf', - 'deepseek-7b-chat-vllm', - 'gemma2-2b-it-hf', - 'gemma2-9b-it-hf', - 'gemma-2b-it-hf', - 'gemma-7b-it-hf', - 'gemma-2-9b-it-turbomind', - 'gemma-7b-it-vllm', - 'internlm2_5-7b-chat-hf', - 'internlm2_5-7b-chat-turbomind', - 'internlm2-chat-1.8b-turbomind', - 'internlm2-chat-1.8b-sft-turbomind', - 'internlm2-chat-7b-lmdeploy', - 'internlm2-chat-7b-sft-turbomind', - 'internlm2-chat-7b-vllm', - 'llama-3_1-8b-instruct-hf', - 'llama-3_2-3b-instruct-hf', - 'llama-3-8b-instruct-hf', - 'llama-3_1-8b-instruct-turbomind', - 'llama-3_2-3b-instruct-turbomind', - 'llama-3-8b-instruct-turbomind', - 'mistral-7b-instruct-v0.2-hf', - 'mistral-7b-instruct-v0.3-hf', - 'mistral-nemo-instruct-2407-hf', - 'mistral-nemo-instruct-2407-turbomind', - 'mistral-7b-instruct-v0.1-vllm', - 'mistral-7b-instruct-v0.2-vllm', - # 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', - # 'minicpm-2b-sft-fp32-hf', - 'phi-3-mini-4k-instruct-hf', - 'qwen1.5-0.5b-chat-hf', - 'qwen2-1.5b-instruct-hf', - 'qwen2-7b-instruct-hf', - 'qwen2-1.5b-instruct-turbomind', - 'qwen2-7b-instruct-turbomind', - 'qwen1.5-0.5b-chat-vllm', - 'yi-1.5-6b-chat-hf', - 'yi-1.5-9b-chat-hf', - 'deepseek-v2-lite-chat-hf', - 'internlm2_5-20b-chat-hf', - 'internlm2_5-20b-chat-turbomind', - 'mistral-small-instruct-2409-hf', - 'mistral-small-instruct-2409-turbomind', - 'qwen2.5-14b-instruct-hf', - 'qwen2.5-14b-instruct-turbomind' -] -base_model_list = [ - 'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', - 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf', - 'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm', - 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', - 'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind', - 'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', - 'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind', - 'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf', - 'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind', - 'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', - 'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind', - 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf', - 'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf', - 'internlm2-20b-turbomind', 'qwen2.5-14b-hf' -] + +def model_list(type): + config_path = '.github/scripts/oc_score_baseline_testrange.yaml' + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config.get(type).keys() + + +def dataset_list(model, type): + config_path = '.github/scripts/oc_score_baseline_fullbench.yaml' + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config.get(model).get(type).keys() @pytest.fixture() @@ -115,36 +60,39 @@ def result_scores(): @pytest.mark.usefixtures('result_scores') @pytest.mark.usefixtures('baseline_scores_testrange') -@pytest.mark.chat +@pytest.mark.chat_models class TestChat: """Test cases for chat model.""" @pytest.mark.parametrize( - 'model, dataset', [(p1, p2) for p1 in chat_model_list + 'model, dataset', [(p1, p2) for p1 in model_list('chat') for p2 in ['gsm8k_accuracy', 'race-high_accuracy']]) def test_model_dataset_score(self, baseline_scores_testrange, result_scores, model, dataset): - base_score = baseline_scores_testrange.get(model).get(dataset) + base_score = baseline_scores_testrange.get('chat').get(model).get( + dataset) result_score = result_scores.get(model).get(dataset) assert_score(model, result_score, base_score) @pytest.mark.usefixtures('result_scores') @pytest.mark.usefixtures('baseline_scores_testrange') -@pytest.mark.base +@pytest.mark.base_models class TestBase: """Test cases for base model.""" - @pytest.mark.parametrize('model, dataset', [ - (p1, p2) for p1 in base_model_list for p2 in - ['gsm8k_accuracy', 'GPQA_diamond', 'race-high_accuracy', 'winogrande'] - ]) + @pytest.mark.parametrize('model, dataset', + [(p1, p2) for p1 in model_list('base') for p2 in [ + 'gsm8k_accuracy', 'GPQA_diamond_accuracy', + 'race-high_accuracy', 'winogrande_accuracy' + ]]) def test_model_dataset_score(self, baseline_scores_testrange, result_scores, model, dataset): if model in ['gemma-2b-vllm', 'gemma-7b-vllm' ] and dataset != 'gsm8k_accuracy': return - base_score = baseline_scores_testrange.get(model).get(dataset) + base_score = baseline_scores_testrange.get('base').get(model).get( + dataset) result_score = result_scores.get(model).get(dataset) assert_score(model, result_score, base_score) @@ -158,28 +106,11 @@ class TestChatObjFullbench: @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ 'internlm2_5-7b-chat-hf_fullbench', 'internlm2_5-7b-chat-turbomind_fullbench' - ] for p2 in [ - 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', - 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', - 'IFEval_Prompt-level-strict-accuracy', 'drop_accuracy', - 'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score', - 'musr_average_naive_average', 'korbench_single_naive_average', - 'gsm8k_accuracy', 'math_accuracy', 'cmo_fib_accuracy', - 'aime2024_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4', - 'sanitized_mbpp_score', 'ds1000_naive_average', - 'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1', - 'lcb_test_output_pass@1', 'bbh-logical_deduction_seven_objects_score', - 'bbh-multistep_arithmetic_two_score', 'mmlu-other_naive_average', - 'cmmlu-china-specific_naive_average', 'mmlu_pro_math_accuracy', - 'ds1000_Pandas_accuracy', 'ds1000_Numpy_accuracy', - 'ds1000_Tensorflow_accuracy', 'ds1000_Scipy_accuracy', - 'ds1000_Sklearn_accuracy', 'ds1000_Pytorch_accuracy', - 'ds1000_Matplotlib_accuracy', 'openai_mmmlu_lite_AR-XY_accuracy', - 'college_naive_average', 'college_knowledge_naive_average' - ]]) + ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')]) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) + base_score = baseline_scores_fullbench.get(model).get('objective').get( + dataset) result_score = result_scores.get(model).get(dataset) assert_score(model, result_score, base_score) @@ -193,32 +124,12 @@ class TestChatSubFullbench: @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ 'internlm2_5-7b-chat-hf_fullbench', 'internlm2_5-7b-chat-turbomind_fullbench' - ] for p2 in [ - 'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score', - 'Followbench_naive_average', 'CompassArena_naive_average', - 'mtbench101_avg', 'wildbench_average', - 'simpleqa_accuracy_given_attempted', - 'chinese_simpleqa_given_attempted_accuracy', - 'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算', - 'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理', - 'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作', - 'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答', - 'alpaca_eval_helpful_base', 'compassarena_language_naive_average', - 'compassarena_knowledge_naive_average', - 'compassarena_reason_v2_naive_average', - 'compassarena_math_v2_naive_average', - 'compassarena_creationv2_zh_naive_average', - 'fofo_test_prompts_overall', 'followbench_llmeval_en_HSR_AVG', - 'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1', - 'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3', - 'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5', - 'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2', - 'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4', - 'followbench_llmeval_en_SSR_L5', 'simpleqa_f1' - ]]) + ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')] + ) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) + base_score = baseline_scores_fullbench.get(model).get( + 'subjective').get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(model, result_score, base_score) @@ -229,25 +140,15 @@ class TestChatSubFullbench: class TestBaseFullbench: """Test cases for chat model.""" - @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ - 'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench' - ] for p2 in [ - 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', - 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy', - 'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score', - 'winogrande_accuracy', 'gsm8k_accuracy', - 'GaokaoBench_2010-2022_Math_II_MCQs_score', - 'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score', - 'math_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4', - 'sanitized_mbpp_score', 'dingo_en_192_score', 'dingo_zh_170_score', - 'mmlu-other_accuracy', 'cmmlu-china-specific_accuracy', - 'mmlu_pro_math_accuracy', 'bbh-logical_deduction_seven_objects_score', - 'bbh-multistep_arithmetic_two_score', 'college_naive_average', - 'college_knowledge_naive_average' - ]]) + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in + ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'] + for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')]) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) + base_score = baseline_scores_fullbench.get(model).get('objective').get( + dataset) result_score = result_scores.get(model).get(dataset) assert_score(model, result_score, base_score) @@ -274,193 +175,64 @@ class TestApibench: class TestVolcFullbench: """Test cases for chat model.""" - @pytest.mark.parametrize('model, dataset', [( - p1, p2 - ) for p1 in ['internlm2_5-7b-chat-turbomind'] for p2 in [ - 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', - 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', - 'mmmlu_lite_naive_average', 'IFEval_Prompt-level-strict-accuracy', - 'drop_accuracy', 'bbh_naive_average', 'GPQA_diamond_accuracy', - 'hellaswag_accuracy', 'TheoremQA_score', 'musr_average_naive_average', - 'korbench_single_naive_average', - 'ARC_Prize_Public_Evaluation_accuracy', 'gsm8k_accuracy', - 'GaokaoBench_weighted_average', 'math_accuracy', 'cmo_fib_accuracy', - 'aime2024_accuracy', 'Mathbench_naive_average', - 'wikibench-wiki-single_choice_cncircular_perf_4', - 'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average', - 'openai_humaneval_humaneval_pass@1', 'sanitized_mbpp_score', - 'humanevalx_naive_average', 'ds1000_naive_average', - 'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1', - 'lcb_test_output_pass@1', 'bigcodebench_hard_instruct_pass@1', - 'bigcodebench_hard_complete_pass@1', 'teval_naive_average', - 'qa_dingo_cn_score', 'mmlu-stem_naive_average', - 'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average', - 'mmlu-other_naive_average', 'cmmlu-stem_naive_average', - 'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average', - 'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average', - 'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy', - 'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy', - 'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy', - 'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy', - 'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy', - 'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy', - 'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy', - 'humanevalx-python_pass@1', 'humanevalx-cpp_pass@1', - 'humanevalx-go_pass@1', 'humanevalx-java_pass@1', - 'humanevalx-js_pass@1', 'ds1000_Pandas_accuracy', - 'ds1000_Numpy_accuracy', 'ds1000_Tensorflow_accuracy', - 'ds1000_Scipy_accuracy', 'ds1000_Sklearn_accuracy', - 'ds1000_Pytorch_accuracy', 'ds1000_Matplotlib_accuracy', - 'openai_mmmlu_lite_AR-XY_accuracy', 'openai_mmmlu_lite_BN-BD_accuracy', - 'openai_mmmlu_lite_DE-DE_accuracy', 'openai_mmmlu_lite_ES-LA_accuracy', - 'openai_mmmlu_lite_FR-FR_accuracy', 'openai_mmmlu_lite_HI-IN_accuracy', - 'openai_mmmlu_lite_ID-ID_accuracy', 'openai_mmmlu_lite_IT-IT_accuracy', - 'openai_mmmlu_lite_JA-JP_accuracy', 'openai_mmmlu_lite_KO-KR_accuracy', - 'openai_mmmlu_lite_PT-BR_accuracy', 'openai_mmmlu_lite_SW-KE_accuracy', - 'openai_mmmlu_lite_YO-NG_accuracy', 'openai_mmmlu_lite_ZH-CN_accuracy', - 'college_naive_average', 'high_naive_average', 'middle_naive_average', - 'primary_naive_average', 'arithmetic_naive_average', - 'mathbench-a (average)_naive_average', - 'college_knowledge_naive_average', 'high_knowledge_naive_average', - 'middle_knowledge_naive_average', 'primary_knowledge_naive_average', - 'mathbench-t (average)_naive_average' - ]]) - @pytest.mark.chat_objective - def test_chat_objective(self, baseline_scores_fullbench, result_scores, - model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) - result_score = result_scores.get(model).get(dataset) - assert_score(model + '_batch', result_score, base_score) - @pytest.mark.parametrize( 'model, dataset', [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind'] - for p2 in [ - 'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score', - 'Followbench_naive_average', 'CompassArena_naive_average', - 'FoFo_naive_average', 'mtbench101_avg', 'wildbench_average', - 'simpleqa_accuracy_given_attempted', - 'chinese_simpleqa_given_attempted_accuracy', - 'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算', - 'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理', - 'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作', - 'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答', - 'alpaca_eval_helpful_base', 'alpaca_eval_koala', - 'alpaca_eval_oasst', 'alpaca_eval_selfinstruct', - 'alpaca_eval_vicuna', 'compassarena_language_naive_average', - 'compassarena_knowledge_naive_average', - 'compassarena_reason_v2_naive_average', - 'compassarena_math_v2_naive_average', - 'compassarena_creationv2_zh_naive_average', - 'fofo_test_prompts_overall', 'fofo_test_prompts_cn_overall', - 'followbench_llmeval_en_HSR_AVG', - 'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1', - 'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3', - 'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5', - 'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2', - 'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4', - 'followbench_llmeval_en_SSR_L5', 'simpleqa_f1' - ]]) - @pytest.mark.chat_subjective - def test_chat_subjective(self, baseline_scores_fullbench, result_scores, - model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) + for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')]) + @pytest.mark.chat_objective + def test_chat_objective(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get('objective').get( + dataset) result_score = result_scores.get(model).get(dataset) assert_score(model + '_batch', result_score, base_score) - @pytest.mark.parametrize('model, dataset', [( - p1, p2 - ) for p1 in ['internlm2_5-7b-turbomind'] for p2 in [ - 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', - 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy', - 'bbh_naive_average', 'GPQA_diamond_accuracy', 'hellaswag_accuracy', - 'TheoremQA_score', 'winogrande_accuracy', 'gsm8k_accuracy', - 'GaokaoBench_weighted_average', 'math_accuracy', - 'Mathbench_naive_average', - 'wikibench-wiki-single_choice_cncircular_perf_4', - 'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average', - 'openai_humaneval_humaneval_pass@1', - 'openai_humaneval_v2_humaneval_pass@1', 'sanitized_mbpp_score', - 'dingo_en_192_score', 'dingo_zh_170_score', 'mmlu-stem_naive_average', - 'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average', - 'mmlu-other_naive_average', 'cmmlu-stem_naive_average', - 'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average', - 'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average', - 'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy', - 'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy', - 'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy', - 'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy', - 'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy', - 'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy', - 'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy', - 'college_naive_average', 'high_naive_average', 'middle_naive_average', - 'primary_naive_average', 'arithmetic_naive_average', - 'mathbench-a (average)_naive_average', - 'college_knowledge_naive_average', 'high_knowledge_naive_average', - 'middle_knowledge_naive_average', 'primary_knowledge_naive_average', - 'mathbench-t (average)_naive_average' - ]]) - @pytest.mark.base_objective - def test_base_objective(self, baseline_scores_fullbench, result_scores, - model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) + @pytest.mark.parametrize('model, dataset', [ + (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind'] + for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective') + ]) + @pytest.mark.chat_subjective + def test_chat_subjective(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'subjective').get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(model + '_batch', result_score, base_score) @pytest.mark.parametrize( 'model, dataset', [(p1, p2) for p1 in ['internlm2_5-7b-turbomind'] - for p2 in [ - 'Single-Needle-Retrieval(S-RT)-32000_naive_average', - 'Single-Needle-Retrieval-EN-32000_naive_average', - 'Single-Needle-Retrieval-ZH-32000_naive_average', - 'Single-Needle-Retrieval(S-RT)-100000_naive_average', - 'Single-Needle-Retrieval-EN-100000_naive_average', - 'Single-Needle-Retrieval-ZH-100000_naive_average', - 'Single-Needle-Retrieval(S-RT)-200000_naive_average', - 'Single-Needle-Retrieval-EN-200000_naive_average', - 'Single-Needle-Retrieval-ZH-200000_naive_average', - 'longbench_naive_average', 'longbench_zh_naive_average', - 'longbench_en_naive_average', - 'longbench_single-document-qa_naive_average', - 'longbench_multi-document-qa_naive_average', - 'longbench_summarization_naive_average', - 'longbench_few-shot-learning_naive_average', - 'longbench_synthetic-tasks_naive_average', - 'longbench_code-completion_naive_average' - ]]) - @pytest.mark.base_long_context - def test_base_long_context(self, baseline_scores_fullbench, result_scores, - model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) + for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')]) + @pytest.mark.base_objective + def test_base_objective(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get('objective').get( + dataset) result_score = result_scores.get(model).get(dataset) assert_score(model + '_batch', result_score, base_score) @pytest.mark.parametrize( 'model, dataset', - [(p1, p2) for p1 in ['internlm2_5-7b-chat-1m-turbomind'] - for p2 in [ - 'ruler_8k_naive_average', 'ruler_32k_naive_average', - 'ruler_128k_naive_average', - 'NeedleBench-Overall-Score-8K_weighted_average', - 'NeedleBench-Overall-Score-32K_weighted_average', - 'NeedleBench-Overall-Score-128K_weighted_average', - 'longbench_naive_average', 'longbench_zh_naive_average', - 'longbench_en_naive_average', 'babilong_0k_naive_average', - 'babilong_4k_naive_average', 'babilong_16k_naive_average', - 'babilong_32k_naive_average', 'babilong_128k_naive_average', - 'babilong_256k_naive_average', - 'longbench_single-document-qa_naive_average', - 'longbench_multi-document-qa_naive_average', - 'longbench_summarization_naive_average', - 'longbench_few-shot-learning_naive_average', - 'longbench_synthetic-tasks_naive_average', - 'longbench_code-completion_naive_average' - ]]) + [(p1, p2) for p1 in ['internlm2_5-7b-turbomind'] + for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')]) + @pytest.mark.base_long_context + def test_base_long_context(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'long_context').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score) + + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) + for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list( + 'internlm2_5-7b-chat-1m-turbomind', 'long_context')]) @pytest.mark.chat_long_context def test_chat_long_context(self, baseline_scores_fullbench, result_scores, model, dataset): - base_score = baseline_scores_fullbench.get(model).get(dataset) + base_score = baseline_scores_fullbench.get(model).get( + 'long_context').get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(model + '_batch', result_score, base_score) diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 681ac5d3..a8e40891 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -14,12 +14,12 @@ internlm2-1.8b-hf: race-high_accuracy: 66.38 internlm2_5-7b-chat-lmdeploy: - demo_gsm8k_accuracy: 84.38 + demo_gsm8k_accuracy: 89.06 race-middle_accuracy: 92.76 race-high_accuracy: 90.54 internlm2-chat-1.8b-lmdeploy: - demo_gsm8k_accuracy: 31 + demo_gsm8k_accuracy: 32 race-middle_accuracy: 81.34 race-high_accuracy: 73.96 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 0359b633..568ed5fd 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -1,447 +1,456 @@ internlm2_5-7b-chat-hf_fullbench: - race-high_accuracy: 93.75 - ARC-c_accuracy: 93.75 - BoolQ_accuracy: 81.25 - triviaqa_wiki_1shot_score: 50 - nq_open_1shot_score: 25 - IFEval_Prompt-level-strict-accuracy: 50 - drop_accuracy: 81.25 - GPQA_diamond_accuracy: 25 - hellaswag_accuracy: 87.5 - TheoremQA_score: 18.75 - musr_average_naive_average: 39.58 - korbench_single_naive_average: 40 - gsm8k_accuracy: 62.50 - math_accuracy: 75 - cmo_fib_accuracy: 6.25 - aime2024_accuracy: 6.25 - wikibench-wiki-single_choice_cncircular_perf_4: 50 - sanitized_mbpp_score: 68.75 - ds1000_naive_average: 16.96 - lcb_code_generation_pass@1: 12.5 - lcb_code_execution_pass@1: 43.75 - lcb_test_output_pass@1: 18.75 - bbh-logical_deduction_seven_objects_score: 50 - bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 72.6 - cmmlu-china-specific_naive_average: 76.25 - mmlu_pro_math_accuracy: 25 - ds1000_Pandas_accuracy: 12.5 - ds1000_Numpy_accuracy: 0 - ds1000_Tensorflow_accuracy: 12.5 - ds1000_Scipy_accuracy: 18.75 - ds1000_Sklearn_accuracy: 18.75 - ds1000_Pytorch_accuracy: 12.5 - ds1000_Matplotlib_accuracy: 43.75 - openai_mmmlu_lite_AR-XY_accuracy: 37.5 - college_naive_average: 12.5 - college_knowledge_naive_average: 87.5 - alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 0 - arenahard_score: 50 - Followbench_naive_average: 1 - CompassArena_naive_average: 54.48 - mtbench101_avg: 8.1 - wildbench_average: -9.86 - simpleqa_accuracy_given_attempted: 0 - chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 8 - alignment_bench_v1_1_数学计算: 0 - alignment_bench_v1_1_基本任务: 0 - alignment_bench_v1_1_逻辑推理: 0 - alignment_bench_v1_1_中文理解: 0 - alignment_bench_v1_1_文本写作: 0 - alignment_bench_v1_1_角色扮演: 0 - alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 0 - compassarena_language_naive_average: 62 - compassarena_knowledge_naive_average: 56 - compassarena_reason_v2_naive_average: 49 - compassarena_math_v2_naive_average: 57.05 - compassarena_creationv2_zh_naive_average: 48.34 - fofo_test_prompts_overall: 1 - followbench_llmeval_en_HSR_AVG: 1 - followbench_llmeval_en_SSR_AVG: 1 - followbench_llmeval_en_HSR_L1: 1 - followbench_llmeval_en_HSR_L2: 1 - followbench_llmeval_en_HSR_L3: 1 - followbench_llmeval_en_HSR_L4: 1 - followbench_llmeval_en_HSR_L5: 1 - followbench_llmeval_en_SSR_L1: 1 - followbench_llmeval_en_SSR_L2: 1 - followbench_llmeval_en_SSR_L3: 1 - followbench_llmeval_en_SSR_L4: 1 - followbench_llmeval_en_SSR_L5: 1 - simpleqa_f1: 0 + objective: + race-high_accuracy: 93.75 + ARC-c_accuracy: 93.75 + BoolQ_accuracy: 81.25 + triviaqa_wiki_1shot_score: 50 + nq_open_1shot_score: 25 + IFEval_Prompt-level-strict-accuracy: 50 + drop_accuracy: 81.25 + GPQA_diamond_accuracy: 25 + hellaswag_accuracy: 87.5 + TheoremQA_score: 18.75 + musr_average_naive_average: 39.58 + korbench_single_naive_average: 40 + gsm8k_accuracy: 62.50 + math_accuracy: 75 + cmo_fib_accuracy: 6.25 + aime2024_accuracy: 6.25 + wikibench-wiki-single_choice_cncircular_perf_4: 50 + sanitized_mbpp_score: 68.75 + ds1000_naive_average: 16.96 + lcb_code_generation_pass@1: 12.5 + lcb_code_execution_pass@1: 43.75 + lcb_test_output_pass@1: 18.75 + bbh-logical_deduction_seven_objects_score: 50 + bbh-multistep_arithmetic_two_score: 68.75 + mmlu-other_naive_average: 72.6 + cmmlu-china-specific_naive_average: 76.25 + mmlu_pro_math_accuracy: 25 + ds1000_Pandas_accuracy: 12.5 + ds1000_Numpy_accuracy: 0 + ds1000_Tensorflow_accuracy: 12.5 + ds1000_Scipy_accuracy: 18.75 + ds1000_Sklearn_accuracy: 18.75 + ds1000_Pytorch_accuracy: 12.5 + ds1000_Matplotlib_accuracy: 43.75 + openai_mmmlu_lite_AR-XY_accuracy: 37.5 + college_naive_average: 12.5 + college_knowledge_naive_average: 87.5 + subjective: + alignment_bench_v1_1_总分: 0.66 + alpaca_eval_total: 20 + arenahard_score: 50 + Followbench_naive_average: 1 + CompassArena_naive_average: 44.00 + mtbench101_avg: 7.8 + wildbench_average: -12.78 + simpleqa_accuracy_given_attempted: 0 + chinese_simpleqa_given_attempted_accuracy: 1 + alignment_bench_v1_1_专业能力: 7.90 + alignment_bench_v1_1_数学计算: 0 + alignment_bench_v1_1_基本任务: 0 + alignment_bench_v1_1_逻辑推理: 0 + alignment_bench_v1_1_中文理解: 0 + alignment_bench_v1_1_文本写作: 0 + alignment_bench_v1_1_角色扮演: 0 + alignment_bench_v1_1_综合问答: 0 + alpaca_eval_helpful_base: 20 + compassarena_language_naive_average: 35 + compassarena_knowledge_naive_average: 55 + compassarena_reason_v2_naive_average: 45.00 + compassarena_math_v2_naive_average: 55 + compassarena_creationv2_zh_naive_average: 30 + followbench_llmeval_en_HSR_AVG: 1 + followbench_llmeval_en_SSR_AVG: 1 + followbench_llmeval_en_HSR_L1: 1 + followbench_llmeval_en_HSR_L2: 1 + followbench_llmeval_en_HSR_L3: 1 + followbench_llmeval_en_HSR_L4: 1 + followbench_llmeval_en_HSR_L5: 1 + followbench_llmeval_en_SSR_L1: 1 + followbench_llmeval_en_SSR_L2: 1 + followbench_llmeval_en_SSR_L3: 1 + followbench_llmeval_en_SSR_L4: 1 + followbench_llmeval_en_SSR_L5: 1 + simpleqa_f1: 0 internlm2_5-7b-chat-turbomind_fullbench: - race-high_accuracy: 93.75 - ARC-c_accuracy: 87.5 - BoolQ_accuracy: 68.75 - triviaqa_wiki_1shot_score: 50 - nq_open_1shot_score: 25 - IFEval_Prompt-level-strict-accuracy: 50 - drop_accuracy: 75 - GPQA_diamond_accuracy: 25 - hellaswag_accuracy: 81.25 - TheoremQA_score: 6.25 - musr_average_naive_average: 37.5 - korbench_single_naive_average: 41.25 - gsm8k_accuracy: 68.75 - math_accuracy: 75 - cmo_fib_accuracy: 6.25 - aime2024_accuracy: 6.25 - wikibench-wiki-single_choice_cncircular_perf_4: 25 - sanitized_mbpp_score: 68.75 - ds1000_naive_average: 13.39 - lcb_code_generation_pass@1: 12.5 - lcb_code_execution_pass@1: 43.75 - lcb_test_output_pass@1: 12.5 - bbh-logical_deduction_seven_objects_score: 56.25 - bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 74.04 - cmmlu-china-specific_naive_average: 76.25 - mmlu_pro_math_accuracy: 25 - ds1000_Pandas_accuracy: 0 - ds1000_Numpy_accuracy: 0 - ds1000_Tensorflow_accuracy: 12.5 - ds1000_Scipy_accuracy: 18.75 - ds1000_Sklearn_accuracy: 18.75 - ds1000_Pytorch_accuracy: 6.25 - ds1000_Matplotlib_accuracy: 37.5 - openai_mmmlu_lite_AR-XY_accuracy: 37.5 - college_naive_average: 0 - college_knowledge_naive_average: 87.5 - alignment_bench_v1_1_总分: 0.68 - alpaca_eval_total: 10 - arenahard_score: 50 - Followbench_naive_average: 1 - CompassArena_naive_average: 52.95 - mtbench101_avg: 8.1 - wildbench_average: -4.44 - simpleqa_accuracy_given_attempted: 0 - chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 8.2 - alignment_bench_v1_1_数学计算: 0 - alignment_bench_v1_1_基本任务: 0 - alignment_bench_v1_1_逻辑推理: 0 - alignment_bench_v1_1_中文理解: 0 - alignment_bench_v1_1_文本写作: 0 - alignment_bench_v1_1_角色扮演: 0 - alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 10 - compassarena_language_naive_average: 61.5 - compassarena_knowledge_naive_average: 56.5 - compassarena_reason_v2_naive_average: 47.5 - compassarena_math_v2_naive_average: 53.03 - compassarena_creationv2_zh_naive_average: 46.22 - fofo_test_prompts_overall: 1 - followbench_llmeval_en_HSR_AVG: 1 - followbench_llmeval_en_SSR_AVG: 1 - followbench_llmeval_en_HSR_L1: 1 - followbench_llmeval_en_HSR_L2: 1 - followbench_llmeval_en_HSR_L3: 1 - followbench_llmeval_en_HSR_L4: 1 - followbench_llmeval_en_HSR_L5: 1 - followbench_llmeval_en_SSR_L1: 1 - followbench_llmeval_en_SSR_L2: 1 - followbench_llmeval_en_SSR_L3: 1 - followbench_llmeval_en_SSR_L4: 1 - followbench_llmeval_en_SSR_L5: 1 - simpleqa_f1: 0 + objective: + race-high_accuracy: 93.75 + ARC-c_accuracy: 93.75 + BoolQ_accuracy: 68.75 + triviaqa_wiki_1shot_score: 50 + nq_open_1shot_score: 25 + IFEval_Prompt-level-strict-accuracy: 56.25 + drop_accuracy: 81.25 + GPQA_diamond_accuracy: 31.25 + hellaswag_accuracy: 81.25 + TheoremQA_score: 6.25 + musr_average_naive_average: 39.58 + korbench_single_naive_average: 37.50 + gsm8k_accuracy: 68.75 + math_accuracy: 68.75 + cmo_fib_accuracy: 6.25 + aime2024_accuracy: 6.25 + wikibench-wiki-single_choice_cncircular_perf_4: 50.00 + sanitized_mbpp_score: 68.75 + ds1000_naive_average: 16.96 + lcb_code_generation_pass@1: 12.5 + lcb_code_execution_pass@1: 43.75 + lcb_test_output_pass@1: 25.00 + bbh-logical_deduction_seven_objects_score: 50.00 + bbh-multistep_arithmetic_two_score: 68.75 + mmlu-other_naive_average: 69.71 + cmmlu-china-specific_naive_average: 75.83 + mmlu_pro_math_accuracy: 31.25 + ds1000_Pandas_accuracy: 0 + ds1000_Numpy_accuracy: 0 + ds1000_Tensorflow_accuracy: 12.5 + ds1000_Scipy_accuracy: 18.75 + ds1000_Sklearn_accuracy: 18.75 + ds1000_Pytorch_accuracy: 18.75 + ds1000_Matplotlib_accuracy: 50.00 + openai_mmmlu_lite_AR-XY_accuracy: 37.5 + college_naive_average: 12.50 + college_knowledge_naive_average: 87.5 + subjective: + alignment_bench_v1_1_总分: 0.70 + alpaca_eval_total: 0 + arenahard_score: 50 + Followbench_naive_average: 1 + CompassArena_naive_average: 38 + mtbench101_avg: 7.80 + wildbench_average: -4.86 + simpleqa_accuracy_given_attempted: 0 + chinese_simpleqa_given_attempted_accuracy: 1 + alignment_bench_v1_1_专业能力: 8.4 + alignment_bench_v1_1_数学计算: 0 + alignment_bench_v1_1_基本任务: 0 + alignment_bench_v1_1_逻辑推理: 0 + alignment_bench_v1_1_中文理解: 0 + alignment_bench_v1_1_文本写作: 0 + alignment_bench_v1_1_角色扮演: 0 + alignment_bench_v1_1_综合问答: 0 + alpaca_eval_helpful_base: 0 + compassarena_language_naive_average: 35 + compassarena_knowledge_naive_average: 50 + compassarena_reason_v2_naive_average: 30 + compassarena_math_v2_naive_average: 50 + compassarena_creationv2_zh_naive_average: 25 + followbench_llmeval_en_HSR_AVG: 1 + followbench_llmeval_en_SSR_AVG: 1 + followbench_llmeval_en_HSR_L1: 1 + followbench_llmeval_en_HSR_L2: 1 + followbench_llmeval_en_HSR_L3: 1 + followbench_llmeval_en_HSR_L4: 1 + followbench_llmeval_en_HSR_L5: 1 + followbench_llmeval_en_SSR_L1: 1 + followbench_llmeval_en_SSR_L2: 1 + followbench_llmeval_en_SSR_L3: 1 + followbench_llmeval_en_SSR_L4: 1 + followbench_llmeval_en_SSR_L5: 1 + simpleqa_f1: 0 internlm2_5-7b-hf_fullbench: - race-high_accuracy: 100 - ARC-c_accuracy: 68.75 - BoolQ_accuracy: 87.5 - triviaqa_wiki_1shot_score: 43.75 - nq_open_1shot_score: 43.75 - drop_accuracy: 62.5 - GPQA_diamond_accuracy: 62.5 - hellaswag_accuracy: 93.75 - TheoremQA_score: 25 - winogrande_accuracy: 75 - gsm8k_accuracy: 37.5 - GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 - GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 - math_accuracy: 12.5 - wikibench-wiki-single_choice_cncircular_perf_4: 25 - sanitized_mbpp_score: 56.25 - dingo_en_192_score: 37.5 - dingo_zh_170_score: 100 - mmlu-other_accuracy: 76.92 - cmmlu-china-specific_accuracy: 84.17 - mmlu_pro_math_accuracy: 18.75 - bbh-logical_deduction_seven_objects_score: 43.75 - bbh-multistep_arithmetic_two_score: 56.25 - college_naive_average: 12.5 - college_knowledge_naive_average: 87.5 + objective: + race-high_accuracy: 100 + ARC-c_accuracy: 68.75 + BoolQ_accuracy: 87.5 + triviaqa_wiki_1shot_score: 43.75 + nq_open_1shot_score: 43.75 + drop_accuracy: 62.5 + GPQA_diamond_accuracy: 62.5 + hellaswag_accuracy: 93.75 + TheoremQA_score: 25 + winogrande_accuracy: 75 + gsm8k_accuracy: 37.5 + GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 + GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 + math_accuracy: 12.5 + wikibench-wiki-single_choice_cncircular_perf_4: 25 + sanitized_mbpp_score: 56.25 + dingo_en_192_score: 37.5 + dingo_zh_170_score: 100 + mmlu-other_accuracy: 76.92 + cmmlu-china-specific_accuracy: 84.17 + mmlu_pro_math_accuracy: 18.75 + bbh-logical_deduction_seven_objects_score: 43.75 + bbh-multistep_arithmetic_two_score: 56.25 + college_naive_average: 12.5 + college_knowledge_naive_average: 87.5 internlm2_5-7b-turbomind_fullbench: - race-high_accuracy: 100 - ARC-c_accuracy: 68.75 - BoolQ_accuracy: 87.5 - triviaqa_wiki_1shot_score: 43.75 - nq_open_1shot_score: 43.75 - drop_accuracy: 62.5 - GPQA_diamond_accuracy: 62.5 - hellaswag_accuracy: 93.75 - TheoremQA_score: 31.25 - winogrande_accuracy: 87.5 - gsm8k_accuracy: 68.75 - GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 - GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 - math_accuracy: 18.75 - wikibench-wiki-single_choice_cncircular_perf_4: 25 - sanitized_mbpp_score: 56.25 - dingo_en_192_score: 43.75 - dingo_zh_170_score: 100 - mmlu-other_accuracy: 76.92 - cmmlu-china-specific_accuracy: 84.17 - mmlu_pro_math_accuracy: 18.75 - bbh-logical_deduction_seven_objects_score: 50 - bbh-multistep_arithmetic_two_score: 56.25 - college_naive_average: 12.5 - college_knowledge_naive_average: 87.5 + objective: + race-high_accuracy: 100 + ARC-c_accuracy: 68.75 + BoolQ_accuracy: 87.5 + triviaqa_wiki_1shot_score: 43.75 + nq_open_1shot_score: 43.75 + drop_accuracy: 62.5 + GPQA_diamond_accuracy: 62.5 + hellaswag_accuracy: 93.75 + TheoremQA_score: 25.00 + winogrande_accuracy: 87.5 + gsm8k_accuracy: 62.50 + GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25 + GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 + math_accuracy: 18.75 + wikibench-wiki-single_choice_cncircular_perf_4: 25 + sanitized_mbpp_score: 62.50 + dingo_en_192_score: 31.25 + dingo_zh_170_score: 93.75 + mmlu-other_accuracy: 76.92 + cmmlu-china-specific_accuracy: 84.17 + mmlu_pro_math_accuracy: 18.75 + bbh-logical_deduction_seven_objects_score: 50 + bbh-multistep_arithmetic_two_score: 56.25 + college_naive_average: 12.5 + college_knowledge_naive_average: 87.5 internlm2_5-7b-turbomind: - race-high_accuracy: 89.28 - ARC-c_accuracy: 52.2 - BoolQ_accuracy: 89.72 - triviaqa_wiki_1shot_score: 65.88 - nq_open_1shot_score: 34.82 - drop_accuracy: 68.1 - bbh_naive_average: 72.15 - GPQA_diamond_accuracy: 32.83 - hellaswag_accuracy: 88.36 - TheoremQA_score: 25 - winogrande_accuracy: 81.29 - gsm8k_accuracy: 74.68 - GaokaoBench_weighted_average: 58.19 - math_accuracy: 33.98 - Mathbench_naive_average: 48.38 - wikibench-wiki-single_choice_cncircular_perf_4: 29.1 - cmmlu_naive_average: 78.94 - mmlu_naive_average: 71.44 - mmlu_pro_naive_average: 38.18 - openai_humaneval_humaneval_pass@1: 59.76 - openai_humaneval_v2_humaneval_pass@1: 51.22 - sanitized_mbpp_score: 55.25 - dingo_en_192_score: 60.94 - dingo_zh_170_score: 67.65 - mmlu-stem_naive_average: 63.72 - mmlu-social-science_naive_average: 80.15 - mmlu-humanities_naive_average: 74.27 - mmlu-other_naive_average: 71.85 - cmmlu-stem_naive_average: 67.07 - cmmlu-social-science_naive_average: 81.49 - cmmlu-humanities_naive_average: 85.84 - cmmlu-other_naive_average: 82.69 - cmmlu-china-specific_naive_average: 79.88 - mmlu_pro_biology_accuracy: 58.58 - mmlu_pro_business_accuracy: 28.01 - mmlu_pro_chemistry_accuracy: 22.79 - mmlu_pro_computer_science_accuracy: 39.02 - mmlu_pro_economics_accuracy: 53.08 - mmlu_pro_engineering_accuracy: 25.7 - mmlu_pro_health_accuracy: 46.94 - mmlu_pro_history_accuracy: 43.04 - mmlu_pro_law_accuracy: 29.7 - mmlu_pro_math_accuracy: 24.2 - mmlu_pro_philosophy_accuracy: 42.48 - mmlu_pro_physics_accuracy: 26.02 - mmlu_pro_psychology_accuracy: 52.76 - mmlu_pro_other_accuracy: 42.21 - college_naive_average: 10.67 - high_naive_average: 6.67 - middle_naive_average: 26.67 - primary_naive_average: 60 - arithmetic_naive_average: 55 - mathbench-a (average)_naive_average: 31.8 - college_knowledge_naive_average: 62.34 - high_knowledge_naive_average: 59.83 - middle_knowledge_naive_average: 71.15 - primary_knowledge_naive_average: 66.55 - mathbench-t (average)_naive_average: 64.97 - Single-Needle-Retrieval(S-RT)-32000_naive_average: 100 - Single-Needle-Retrieval-EN-32000_naive_average: 100 - Single-Needle-Retrieval-ZH-32000_naive_average: 100 - Single-Needle-Retrieval(S-RT)-100000_naive_average: 100 - Single-Needle-Retrieval-EN-100000_naive_average: 100 - Single-Needle-Retrieval-ZH-100000_naive_average: 100 - Single-Needle-Retrieval(S-RT)-200000_naive_average: 100 - Single-Needle-Retrieval-EN-200000_naive_average: 100 - Single-Needle-Retrieval-ZH-200000_naive_average: 100 - longbench_naive_average: 46.19 - longbench_zh_naive_average: 49.3 - longbench_en_naive_average: 43.97 - longbench_single-document-qa_naive_average: 42.84 - longbench_multi-document-qa_naive_average: 37.29 - longbench_summarization_naive_average: 23.21 - longbench_few-shot-learning_naive_average: 61.67 - longbench_synthetic-tasks_naive_average: 60.05 - longbench_code-completion_naive_average: 52.09 + objective: + race-high_accuracy: 89.28 + ARC-c_accuracy: 52.2 + BoolQ_accuracy: 89.72 + triviaqa_wiki_1shot_score: 65.88 + nq_open_1shot_score: 34.82 + drop_accuracy: 68.1 + bbh_naive_average: 72.15 + GPQA_diamond_accuracy: 32.83 + hellaswag_accuracy: 88.36 + TheoremQA_score: 25 + winogrande_accuracy: 81.29 + gsm8k_accuracy: 74.68 + GaokaoBench_weighted_average: 58.19 + math_accuracy: 33.98 + Mathbench_naive_average: 48.38 + wikibench-wiki-single_choice_cncircular_perf_4: 29.1 + cmmlu_naive_average: 78.94 + mmlu_naive_average: 71.44 + mmlu_pro_naive_average: 38.18 + openai_humaneval_humaneval_pass@1: 59.76 + openai_humaneval_v2_humaneval_pass@1: 51.22 + sanitized_mbpp_score: 55.25 + dingo_en_192_score: 60.94 + dingo_zh_170_score: 67.65 + mmlu-stem_naive_average: 63.72 + mmlu-social-science_naive_average: 80.15 + mmlu-humanities_naive_average: 74.27 + mmlu-other_naive_average: 71.85 + cmmlu-stem_naive_average: 67.07 + cmmlu-social-science_naive_average: 81.49 + cmmlu-humanities_naive_average: 85.84 + cmmlu-other_naive_average: 82.69 + cmmlu-china-specific_naive_average: 79.88 + mmlu_pro_biology_accuracy: 58.58 + mmlu_pro_business_accuracy: 28.01 + mmlu_pro_chemistry_accuracy: 22.79 + mmlu_pro_computer_science_accuracy: 39.02 + mmlu_pro_economics_accuracy: 53.08 + mmlu_pro_engineering_accuracy: 25.7 + mmlu_pro_health_accuracy: 46.94 + mmlu_pro_history_accuracy: 43.04 + mmlu_pro_law_accuracy: 29.7 + mmlu_pro_math_accuracy: 24.2 + mmlu_pro_philosophy_accuracy: 42.48 + mmlu_pro_physics_accuracy: 26.02 + mmlu_pro_psychology_accuracy: 52.76 + mmlu_pro_other_accuracy: 42.21 + college_naive_average: 10.67 + high_naive_average: 6.67 + middle_naive_average: 26.67 + primary_naive_average: 60 + arithmetic_naive_average: 55 + mathbench-a (average)_naive_average: 31.8 + college_knowledge_naive_average: 62.34 + high_knowledge_naive_average: 59.83 + middle_knowledge_naive_average: 71.15 + primary_knowledge_naive_average: 66.55 + mathbench-t (average)_naive_average: 64.97 + long_context: + Single-Needle-Retrieval(S-RT)-32000_naive_average: 100 + Single-Needle-Retrieval-EN-32000_naive_average: 100 + Single-Needle-Retrieval-ZH-32000_naive_average: 100 + Single-Needle-Retrieval(S-RT)-100000_naive_average: 100 + Single-Needle-Retrieval-EN-100000_naive_average: 100 + Single-Needle-Retrieval-ZH-100000_naive_average: 100 + Single-Needle-Retrieval(S-RT)-200000_naive_average: 100 + Single-Needle-Retrieval-EN-200000_naive_average: 100 + Single-Needle-Retrieval-ZH-200000_naive_average: 100 + longbench_naive_average: 46.19 + longbench_zh_naive_average: 49.3 + longbench_en_naive_average: 43.97 + longbench_single-document-qa_naive_average: 42.84 + longbench_multi-document-qa_naive_average: 37.29 + longbench_summarization_naive_average: 23.21 + longbench_few-shot-learning_naive_average: 61.67 + longbench_synthetic-tasks_naive_average: 60.05 + longbench_code-completion_naive_average: 52.09 internlm2_5-7b-chat-turbomind: - race-high_accuracy: 86.16 - ARC-c_accuracy: 90.17 - BoolQ_accuracy: 87.89 - triviaqa_wiki_1shot_score: 64.91 - nq_open_1shot_score: 22.69 - mmmlu_lite_naive_average: 44.96 - IFEval_Prompt-level-strict-accuracy: 58.04 - drop_accuracy: 77.68 - bbh_naive_average: 73.14 - GPQA_diamond_accuracy: 25.76 - hellaswag_accuracy: 94.79 - TheoremQA_score: 21.5 - musr_average_naive_average: 51.03 - korbench_single_naive_average: 31.92 - ARC_Prize_Public_Evaluation_accuracy: 0.01 - gsm8k_accuracy: 86.73 - GaokaoBench_weighted_average: 77.89 - math_accuracy: 61.5 - cmo_fib_accuracy: 12.5 - aime2024_accuracy: 3.33 - Mathbench_naive_average: 65.17 - wikibench-wiki-single_choice_cncircular_perf_4: 31.55 - cmmlu_naive_average: 74.14 - mmlu_naive_average: 70.52 - mmlu_pro_naive_average: 44.98 - openai_humaneval_humaneval_pass@1: 70.73 - sanitized_mbpp_score: 63.81 - humanevalx_naive_average: 38.17 - ds1000_naive_average: 14.15 - lcb_code_generation_pass@1: 17.75 - lcb_code_execution_pass@1: 32.57 - lcb_test_output_pass@1: 24.89 - bigcodebench_hard_instruct_pass@1: 0.08 - bigcodebench_hard_complete_pass@1: 0.06 - teval_naive_average: 80.03 - qa_dingo_cn_score: 99.01 - mmlu-stem_naive_average: 68.2 - mmlu-social-science_naive_average: 76.11 - mmlu-humanities_naive_average: 68.71 - mmlu-other_naive_average: 70.56 - cmmlu-stem_naive_average: 66.27 - cmmlu-social-science_naive_average: 75.7 - cmmlu-humanities_naive_average: 77.7 - cmmlu-other_naive_average: 77.71 - cmmlu-china-specific_naive_average: 72.94 - mmlu_pro_biology_accuracy: 66.25 - mmlu_pro_business_accuracy: 48.42 - mmlu_pro_chemistry_accuracy: 35.25 - mmlu_pro_computer_science_accuracy: 47.56 - mmlu_pro_economics_accuracy: 55.92 - mmlu_pro_engineering_accuracy: 30.44 - mmlu_pro_health_accuracy: 45.97 - mmlu_pro_history_accuracy: 41.21 - mmlu_pro_law_accuracy: 25.79 - mmlu_pro_math_accuracy: 54.03 - mmlu_pro_philosophy_accuracy: 36.47 - mmlu_pro_physics_accuracy: 37.41 - mmlu_pro_psychology_accuracy: 58.77 - mmlu_pro_other_accuracy: 46.21 - humanevalx-python_pass@1: 53.66 - humanevalx-cpp_pass@1: 24.39 - humanevalx-go_pass@1: 0 - humanevalx-java_pass@1: 57.93 - humanevalx-js_pass@1: 54.88 - ds1000_Pandas_accuracy: 12.03 - ds1000_Numpy_accuracy: 4.09 - ds1000_Tensorflow_accuracy: 11.11 - ds1000_Scipy_accuracy: 8.49 - ds1000_Sklearn_accuracy: 6.96 - ds1000_Pytorch_accuracy: 7.35 - ds1000_Matplotlib_accuracy: 49.03 - openai_mmmlu_lite_AR-XY_accuracy: 17.89 - openai_mmmlu_lite_BN-BD_accuracy: 27.58 - openai_mmmlu_lite_DE-DE_accuracy: 51.16 - openai_mmmlu_lite_ES-LA_accuracy: 56.84 - openai_mmmlu_lite_FR-FR_accuracy: 57.96 - openai_mmmlu_lite_HI-IN_accuracy: 33.68 - openai_mmmlu_lite_ID-ID_accuracy: 51.02 - openai_mmmlu_lite_IT-IT_accuracy: 50.46 - openai_mmmlu_lite_JA-JP_accuracy: 50.53 - openai_mmmlu_lite_KO-KR_accuracy: 45.05 - openai_mmmlu_lite_PT-BR_accuracy: 57.68 - openai_mmmlu_lite_SW-KE_accuracy: 32.77 - openai_mmmlu_lite_YO-NG_accuracy: 31.79 - openai_mmmlu_lite_ZH-CN_accuracy: 65.05 - college_naive_average: 20.33 - high_naive_average: 47.67 - middle_naive_average: 62 - primary_naive_average: 72 - arithmetic_naive_average: 62.33 - mathbench-a (average)_naive_average: 52.87 - college_knowledge_naive_average: 70.57 - high_knowledge_naive_average: 70.13 - middle_knowledge_naive_average: 81.17 - primary_knowledge_naive_average: 88.01 - mathbench-t (average)_naive_average: 77.47 - alignment_bench_v1_1_总分: 5.68 - alpaca_eval_total: 25.96 - arenahard_score: 17.15 - Followbench_naive_average: 0.81 - CompassArena_naive_average: 34.61 - FoFo_naive_average: 0.38 - mtbench101_avg: 8.01 - wildbench_average: -15.69 - simpleqa_accuracy_given_attempted: 0.04 - chinese_simpleqa_given_attempted_accuracy: 0.34 - alignment_bench_v1_1_专业能力: 6.05 - alignment_bench_v1_1_数学计算: 5.87 - alignment_bench_v1_1_基本任务: 6.01 - alignment_bench_v1_1_逻辑推理: 4.48 - alignment_bench_v1_1_中文理解: 6.17 - alignment_bench_v1_1_文本写作: 6.06 - alignment_bench_v1_1_角色扮演: 6.3 - alignment_bench_v1_1_综合问答: 6.45 - alpaca_eval_helpful_base: 17.83 - alpaca_eval_koala: 28.21 - alpaca_eval_oasst: 23.4 - alpaca_eval_selfinstruct: 30.95 - alpaca_eval_vicuna: 25 - compassarena_language_naive_average: 52.5 - compassarena_knowledge_naive_average: 36 - compassarena_reason_v2_naive_average: 35 - compassarena_math_v2_naive_average: 19.91 - compassarena_creationv2_zh_naive_average: 29.64 - fofo_test_prompts_overall: 0.35 - fofo_test_prompts_cn_overall: 0.41 - followbench_llmeval_en_HSR_AVG: 0.73 - followbench_llmeval_en_SSR_AVG: 0.88 - followbench_llmeval_en_HSR_L1: 0.94 - followbench_llmeval_en_HSR_L2: 0.77 - followbench_llmeval_en_HSR_L3: 0.73 - followbench_llmeval_en_HSR_L4: 0.68 - followbench_llmeval_en_HSR_L5: 0.54 - followbench_llmeval_en_SSR_L1: 0.94 - followbench_llmeval_en_SSR_L2: 0.88 - followbench_llmeval_en_SSR_L3: 0.87 - followbench_llmeval_en_SSR_L4: 0.87 - followbench_llmeval_en_SSR_L5: 0.85 - simpleqa_f1: 0.04 + objective: + race-high_accuracy: 86.16 + ARC-c_accuracy: 90.17 + BoolQ_accuracy: 87.89 + triviaqa_wiki_1shot_score: 64.91 + nq_open_1shot_score: 22.69 + mmmlu_lite_naive_average: 44.96 + IFEval_Prompt-level-strict-accuracy: 58.04 + drop_accuracy: 77.68 + bbh_naive_average: 73.14 + GPQA_diamond_accuracy: 25.76 + hellaswag_accuracy: 94.79 + TheoremQA_score: 21.5 + musr_average_naive_average: 51.03 + korbench_single_naive_average: 31.92 + ARC_Prize_Public_Evaluation_accuracy: 0.01 + gsm8k_accuracy: 86.73 + GaokaoBench_weighted_average: 77.89 + math_accuracy: 61.5 + cmo_fib_accuracy: 12.5 + aime2024_accuracy: 3.33 + Mathbench_naive_average: 65.17 + wikibench-wiki-single_choice_cncircular_perf_4: 31.55 + cmmlu_naive_average: 74.14 + mmlu_naive_average: 70.52 + mmlu_pro_naive_average: 44.98 + openai_humaneval_humaneval_pass@1: 70.73 + sanitized_mbpp_score: 63.81 + humanevalx_naive_average: 38.17 + ds1000_naive_average: 14.15 + lcb_code_generation_pass@1: 17.75 + lcb_code_execution_pass@1: 32.57 + lcb_test_output_pass@1: 24.89 + bigcodebench_hard_instruct_pass@1: 0.08 + bigcodebench_hard_complete_pass@1: 0.06 + teval_naive_average: 80.03 + qa_dingo_cn_score: 99.01 + mmlu-stem_naive_average: 68.2 + mmlu-social-science_naive_average: 76.11 + mmlu-humanities_naive_average: 68.71 + mmlu-other_naive_average: 70.56 + cmmlu-stem_naive_average: 66.27 + cmmlu-social-science_naive_average: 75.7 + cmmlu-humanities_naive_average: 77.7 + cmmlu-other_naive_average: 77.71 + cmmlu-china-specific_naive_average: 72.94 + mmlu_pro_biology_accuracy: 66.25 + mmlu_pro_business_accuracy: 48.42 + mmlu_pro_chemistry_accuracy: 35.25 + mmlu_pro_computer_science_accuracy: 47.56 + mmlu_pro_economics_accuracy: 55.92 + mmlu_pro_engineering_accuracy: 30.44 + mmlu_pro_health_accuracy: 45.97 + mmlu_pro_history_accuracy: 41.21 + mmlu_pro_law_accuracy: 25.79 + mmlu_pro_math_accuracy: 54.03 + mmlu_pro_philosophy_accuracy: 36.47 + mmlu_pro_physics_accuracy: 37.41 + mmlu_pro_psychology_accuracy: 58.77 + mmlu_pro_other_accuracy: 46.21 + humanevalx-python_pass@1: 53.66 + humanevalx-cpp_pass@1: 24.39 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 57.93 + humanevalx-js_pass@1: 54.88 + ds1000_Pandas_accuracy: 12.03 + ds1000_Numpy_accuracy: 4.09 + ds1000_Tensorflow_accuracy: 11.11 + ds1000_Scipy_accuracy: 8.49 + ds1000_Sklearn_accuracy: 6.96 + ds1000_Pytorch_accuracy: 7.35 + ds1000_Matplotlib_accuracy: 49.03 + openai_mmmlu_lite_AR-XY_accuracy: 17.89 + openai_mmmlu_lite_BN-BD_accuracy: 27.58 + openai_mmmlu_lite_DE-DE_accuracy: 51.16 + openai_mmmlu_lite_ES-LA_accuracy: 56.84 + openai_mmmlu_lite_FR-FR_accuracy: 57.96 + openai_mmmlu_lite_HI-IN_accuracy: 33.68 + openai_mmmlu_lite_ID-ID_accuracy: 51.02 + openai_mmmlu_lite_IT-IT_accuracy: 50.46 + openai_mmmlu_lite_JA-JP_accuracy: 50.53 + openai_mmmlu_lite_KO-KR_accuracy: 45.05 + openai_mmmlu_lite_PT-BR_accuracy: 57.68 + openai_mmmlu_lite_SW-KE_accuracy: 32.77 + openai_mmmlu_lite_YO-NG_accuracy: 31.79 + openai_mmmlu_lite_ZH-CN_accuracy: 65.05 + college_naive_average: 20.33 + high_naive_average: 47.67 + middle_naive_average: 62 + primary_naive_average: 72 + arithmetic_naive_average: 62.33 + mathbench-a (average)_naive_average: 52.87 + college_knowledge_naive_average: 70.57 + high_knowledge_naive_average: 70.13 + middle_knowledge_naive_average: 81.17 + primary_knowledge_naive_average: 88.01 + mathbench-t (average)_naive_average: 77.47 + subjective: + alignment_bench_v1_1_总分: 5.68 + alpaca_eval_total: 25.96 + arenahard_score: 17.15 + Followbench_naive_average: 0.81 + CompassArena_naive_average: 34.61 + FoFo_naive_average: 0.38 + mtbench101_avg: 8.01 + wildbench_average: -15.69 + simpleqa_accuracy_given_attempted: 0.04 + chinese_simpleqa_given_attempted_accuracy: 0.34 + alignment_bench_v1_1_专业能力: 6.05 + alignment_bench_v1_1_数学计算: 5.87 + alignment_bench_v1_1_基本任务: 6.01 + alignment_bench_v1_1_逻辑推理: 4.48 + alignment_bench_v1_1_中文理解: 6.17 + alignment_bench_v1_1_文本写作: 6.06 + alignment_bench_v1_1_角色扮演: 6.3 + alignment_bench_v1_1_综合问答: 6.45 + alpaca_eval_helpful_base: 17.83 + alpaca_eval_koala: 28.21 + alpaca_eval_oasst: 23.4 + alpaca_eval_selfinstruct: 30.95 + alpaca_eval_vicuna: 25 + compassarena_language_naive_average: 52.5 + compassarena_knowledge_naive_average: 36 + compassarena_reason_v2_naive_average: 35 + compassarena_math_v2_naive_average: 19.91 + compassarena_creationv2_zh_naive_average: 29.64 + fofo_test_prompts_overall: 0.35 + fofo_test_prompts_cn_overall: 0.41 + followbench_llmeval_en_HSR_AVG: 0.73 + followbench_llmeval_en_SSR_AVG: 0.88 + followbench_llmeval_en_HSR_L1: 0.94 + followbench_llmeval_en_HSR_L2: 0.77 + followbench_llmeval_en_HSR_L3: 0.73 + followbench_llmeval_en_HSR_L4: 0.68 + followbench_llmeval_en_HSR_L5: 0.54 + followbench_llmeval_en_SSR_L1: 0.94 + followbench_llmeval_en_SSR_L2: 0.88 + followbench_llmeval_en_SSR_L3: 0.87 + followbench_llmeval_en_SSR_L4: 0.87 + followbench_llmeval_en_SSR_L5: 0.85 + simpleqa_f1: 0.04 internlm2_5-7b-chat-1m-turbomind: - ruler_8k_naive_average: 88.53 - ruler_32k_naive_average: 83.84 - ruler_128k_naive_average: 70.94 - NeedleBench-Overall-Score-8K_weighted_average: 91.89 - NeedleBench-Overall-Score-32K_weighted_average: 91.42 - NeedleBench-Overall-Score-128K_weighted_average: 88.57 - longbench_naive_average: 46.44 - longbench_zh_naive_average: 45.19 - longbench_en_naive_average: 45.71 - babilong_0k_naive_average: 79.3 - babilong_4k_naive_average: 67 - babilong_16k_naive_average: 52.7 - babilong_32k_naive_average: 48.9 - babilong_128k_naive_average: 40.8 - babilong_256k_naive_average: 23.5 - longbench_single-document-qa_naive_average: 43.56 - longbench_multi-document-qa_naive_average: 46.24 - longbench_summarization_naive_average: 24.32 - longbench_few-shot-learning_naive_average: 51.67 - longbench_synthetic-tasks_naive_average: 66.83 - longbench_code-completion_naive_average: 45.99 + long_context: + ruler_8k_naive_average: 88.53 + ruler_32k_naive_average: 83.84 + ruler_128k_naive_average: 70.94 + NeedleBench-Overall-Score-8K_weighted_average: 91.89 + NeedleBench-Overall-Score-32K_weighted_average: 91.42 + NeedleBench-Overall-Score-128K_weighted_average: 88.57 + longbench_naive_average: 46.44 + longbench_zh_naive_average: 45.19 + longbench_en_naive_average: 45.71 + babilong_0k_naive_average: 79.3 + babilong_4k_naive_average: 67 + babilong_16k_naive_average: 52.7 + babilong_32k_naive_average: 48.9 + babilong_128k_naive_average: 40.8 + babilong_256k_naive_average: 23.5 + longbench_single-document-qa_naive_average: 43.56 + longbench_multi-document-qa_naive_average: 46.24 + longbench_summarization_naive_average: 24.32 + longbench_few-shot-learning_naive_average: 51.67 + longbench_synthetic-tasks_naive_average: 66.83 + longbench_code-completion_naive_average: 45.99 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index f1254343..131fd2ea 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -1,459 +1,468 @@ -baichuan2-7b-chat-hf: - gsm8k_accuracy: 18.75 - race-high_accuracy: 78.12 - -glm-4-9b-chat-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 90.62 - -glm-4-9b-chat-turbomind: - gsm8k_accuracy: 75.00 - race-high_accuracy: 90.62 - -glm-4-9b-chat-vllm: - gsm8k_accuracy: 65.62 - race-high_accuracy: 90.62 - -deepseek-7b-chat-hf: - gsm8k_accuracy: 46.88 - race-high_accuracy: 81.25 - -deepseek-moe-16b-chat-hf: - gsm8k_accuracy: 50 - race-high_accuracy: 68.75 - -deepseek-7b-chat-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 75 - -gemma2-2b-it-hf: - gsm8k_accuracy: 50 - race-high_accuracy: 71.88 - -gemma2-9b-it-hf: - gsm8k_accuracy: 71.88 - race-high_accuracy: 84.38 - -gemma-2b-it-hf: - gsm8k_accuracy: 3.12 - race-high_accuracy: 40.62 - -gemma-7b-it-hf: - gsm8k_accuracy: 40.62 - race-high_accuracy: 68.75 - -gemma-2-9b-it-turbomind: - gsm8k_accuracy: 65.62 - race-high_accuracy: 84.38 - -gemma-7b-it-vllm: - gsm8k_accuracy: 34.38 - race-high_accuracy: 68.75 - -internlm2_5-7b-chat-hf: - gsm8k_accuracy: 84.38 - race-high_accuracy: 90.62 - -internlm2_5-7b-chat-turbomind: - gsm8k_accuracy: 84.38 - race-high_accuracy: 90.62 - -internlm2-chat-1.8b-turbomind: - gsm8k_accuracy: 25 - race-high_accuracy: 84.38 - -internlm2-chat-1.8b-sft-turbomind: - gsm8k_accuracy: 21.88 - race-high_accuracy: 84.38 - -internlm2-chat-7b-lmdeploy: - gsm8k_accuracy: 53.12 - race-high_accuracy: 84.38 - -internlm2-chat-7b-sft-turbomind: - gsm8k_accuracy: 50 - race-high_accuracy: 90.62 - -internlm2-chat-7b-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 87.5 - -llama-3_1-8b-instruct-hf: - gsm8k_accuracy: 84.38 - race-high_accuracy: 90.62 - -llama-3_2-3b-instruct-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 81.25 - -llama-3-8b-instruct-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 87.5 - -llama-3_1-8b-instruct-turbomind: - gsm8k_accuracy: 78.12 - race-high_accuracy: 90.62 - -llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 65.62 - race-high_accuracy: 81.25 - -llama-3-8b-instruct-turbomind: - gsm8k_accuracy: 68.75 - race-high_accuracy: 87.5 - -mistral-7b-instruct-v0.2-hf: - gsm8k_accuracy: 40.62 - race-high_accuracy: 75 - -mistral-7b-instruct-v0.3-hf: - gsm8k_accuracy: 40.62 - race-high_accuracy: 75 - -mistral-nemo-instruct-2407-hf: - gsm8k_accuracy: 75 - race-high_accuracy: 81.25 - -mistral-nemo-instruct-2407-turbomind: - gsm8k_accuracy: 68.75 - race-high_accuracy: 87.50 - -mistral-7b-instruct-v0.1-vllm: - gsm8k_accuracy: 34.38 - race-high_accuracy: 68.75 - -mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 75 - -MiniCPM3-4B-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 84.38 - -minicpm-2b-dpo-fp32-hf: - gsm8k_accuracy: 56.25 - race-high_accuracy: 53.12 - -minicpm-2b-sft-bf16-hf: - gsm8k_accuracy: 46.88 - race-high_accuracy: 65.62 - -minicpm-2b-sft-fp32-hf: - gsm8k_accuracy: 46.88 - race-high_accuracy: 65.62 - -phi-3-mini-4k-instruct-hf: - gsm8k_accuracy: 56.25 - race-high_accuracy: 84.38 - -qwen1.5-0.5b-chat-hf: - gsm8k_accuracy: 0 - race-high_accuracy: 53.12 - -qwen2-1.5b-instruct-hf: - gsm8k_accuracy: 62.5 - race-high_accuracy: 84.38 - -qwen2-7b-instruct-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 90.62 - -qwen2-1.5b-instruct-turbomind: - gsm8k_accuracy: 62.50 - race-high_accuracy: 84.38 - -qwen2-7b-instruct-turbomind: - gsm8k_accuracy: 81.25 - race-high_accuracy: 87.5 - -qwen1.5-0.5b-chat-vllm: - gsm8k_accuracy: 3.12 - race-high_accuracy: 53.12 - -yi-1.5-6b-chat-hf: - gsm8k_accuracy: 65.62 - race-high_accuracy: 84.38 - -yi-1.5-9b-chat-hf: - gsm8k_accuracy: 75 - race-high_accuracy: 93.75 - -deepseek-v2-lite-chat-hf: - gsm8k_accuracy: 43.75 - race-high_accuracy: 71.88 - -internlm2_5-20b-chat-hf: - gsm8k_accuracy: 84.38 - race-high_accuracy: 87.5 - -internlm2_5-20b-chat-turbomind: - gsm8k_accuracy: 84.38 - race-high_accuracy: 87.5 - -mistral-small-instruct-2409-hf: - gsm8k_accuracy: 81.25 - race-high_accuracy: 87.50 - -mistral-small-instruct-2409-turbomind: - gsm8k_accuracy: 78.12 - race-high_accuracy: 87.50 - -qwen2.5-14b-instruct-hf: - gsm8k_accuracy: 71.88 - race-high_accuracy: 96.88 - -qwen2.5-14b-instruct-turbomind: - gsm8k_accuracy: 71.88 - race-high_accuracy: 93.75 - -glm-4-9b-hf: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 - -deepseek-moe-16b-base-hf: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 21.88 - winogrande_accuracy: 65.62 - -deepseek-7b-base-turbomind: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 46.88 - winogrande_accuracy: 84.38 - -deepseek-moe-16b-base-vllm: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 25 - winogrande_accuracy: 68.75 - -gemma2-2b-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 3.12 - race-high_accuracy: 56.25 - winogrande_accuracy: 71.88 - -gemma2-9b-hf: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 81.25 - winogrande_accuracy: 84.38 - -gemma-2b-hf: - gsm8k_accuracy: 18.75 - GPQA_diamond_accuracy: 3.12 - race-high_accuracy: 25 - winogrande_accuracy: 53.12 - -gemma-7b-hf: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 65.62 - winogrande_accuracy: 78.12 - -gemma-2b-vllm: - gsm8k_accuracy: 15.62 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: - winogrande_accuracy: - -gemma-7b-vllm: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: - winogrande_accuracy: - -internlm2_5-7b-hf: - gsm8k_accuracy: 37.5 - GPQA_diamond_accuracy: 25 - race-high_accuracy: 93.75 - winogrande_accuracy: 71.88 - -internlm2-7b-hf: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 18.75 - race-high_accuracy: 62.5 - winogrande_accuracy: 78.12 - -internlm2-base-7b-hf: - gsm8k_accuracy: 3.12 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 75 - winogrande_accuracy: 65.62 - -internlm2-1.8b-turbomind: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 12.5 - race-high_accuracy: 71.88 - winogrande_accuracy: 75 - -internlm2_5-7b-turbomind: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 - -internlm2-7b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 75 - winogrande_accuracy: 81.25 - -internlm2-base-7b-turbomind: - gsm8k_accuracy: 40.62 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 84.38 - winogrande_accuracy: 71.88 - -llama-2-7b-hf: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 40.62 - winogrande_accuracy: 71.88 - -llama-3_1-8b-hf: - gsm8k_accuracy: 78.12 - GPQA_diamond_accuracy: 25 - race-high_accuracy: 90.62 - winogrande_accuracy: 62.5 - -llama-3-8b-hf: - gsm8k_accuracy: 46.88 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 65.62 - winogrande_accuracy: 65.62 - -llama-3.1-8b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 78.12 - winogrande_accuracy: 78.12 - -llama-3-8b-turbomind: - gsm8k_accuracy: 50 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 65.62 - winogrande_accuracy: 78.12 - -mistral-7b-v0.2-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 62.5 - winogrande_accuracy: 59.38 - -mistral-7b-v0.3-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 62.5 - winogrande_accuracy: 59.38 - -mistral-7b-v0.2-vllm: - gsm8k_accuracy: 34.38 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 62.5 - winogrande_accuracy: 65.62 - -qwen2.5-7b-hf: - gsm8k_accuracy: 81.25 - GPQA_diamond_accuracy: 18.75 - race-high_accuracy: 87.5 - winogrande_accuracy: 71.88 - -qwen2.5-1.5b-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 78.12 - winogrande_accuracy: 71.88 - -qwen2.5-7b-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 25 - race-high_accuracy: 87.5 - winogrande_accuracy: 71.88 - -qwen1.5-moe-a2.7b-hf: - gsm8k_accuracy: 62.5 - GPQA_diamond_accuracy: 18.75 - race-high_accuracy: 84.38 - winogrande_accuracy: 75 - -qwen2-0.5b-hf: - gsm8k_accuracy: 25 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 40.62 - winogrande_accuracy: 62.5 - -qwen2-1.5b-hf: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 81.25 - winogrande_accuracy: 62.5 - -qwen2-7b-hf: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 87.5 - winogrande_accuracy: 68.75 - -qwen2-1.5b-turbomind: - gsm8k_accuracy: 62.50 - GPQA_diamond_accuracy: 6.25 - race-high_accuracy: 81.25 - winogrande_accuracy: 75 - -qwen2-7b-turbomind: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 12.5 - race-high_accuracy: 87.5 - winogrande_accuracy: 71.88 - -qwen1.5-0.5b-vllm: - gsm8k_accuracy: 9.38 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 56.25 - winogrande_accuracy: 62.5 - -yi-1.5-6b-hf: - gsm8k_accuracy: 62.5 - GPQA_diamond_accuracy: 3.12 - race-high_accuracy: 87.5 - winogrande_accuracy: 62.5 - -yi-1.5-9b-hf: - gsm8k_accuracy: 75 - GPQA_diamond_accuracy: 40.62 - race-high_accuracy: 87.5 - winogrande_accuracy: 59.38 - -deepseek-v2-lite-hf: - gsm8k_accuracy: 28.12 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 59.38 - winogrande_accuracy: 75 - -internlm2-20b-hf: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 68.75 - winogrande_accuracy: 75 - -internlm2-base-20b-hf: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 84.38 - winogrande_accuracy: 65.62 - -internlm2-20b-turbomind: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 68.75 - winogrande_accuracy: 81.25 - -qwen2.5-14b-hf: - gsm8k_accuracy: 75 - GPQA_diamond_accuracy: 37.5 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 +chat: + glm-4-9b-chat-hf: + gsm8k_accuracy: 68.75 + race-high_accuracy: 90.62 + glm-4-9b-chat-turbomind: + gsm8k_accuracy: 71.88 + race-high_accuracy: 90.62 + glm-4-9b-chat-vllm: + gsm8k_accuracy: 65.62 + race-high_accuracy: 90.62 + deepseek-7b-chat-hf: + gsm8k_accuracy: 46.88 + race-high_accuracy: 81.25 + deepseek-moe-16b-chat-hf: + gsm8k_accuracy: 50 + race-high_accuracy: 68.75 + deepseek-7b-chat-vllm: + gsm8k_accuracy: 43.75 + race-high_accuracy: 75 + gemma2-2b-it-hf: + gsm8k_accuracy: 50 + race-high_accuracy: 71.88 + gemma2-9b-it-hf: + gsm8k_accuracy: 71.88 + race-high_accuracy: 84.38 + gemma-2b-it-hf: + gsm8k_accuracy: 3.12 + race-high_accuracy: 40.62 + gemma-7b-it-hf: + gsm8k_accuracy: 40.62 + race-high_accuracy: 68.75 + gemma-2-9b-it-turbomind: + gsm8k_accuracy: 71.88 + race-high_accuracy: 84.38 + gemma-2-27b-it-turbomind: + gsm8k_accuracy: 78.12 + race-high_accuracy: 93.75 + gemma-7b-it-vllm: + gsm8k_accuracy: 34.38 + race-high_accuracy: 68.75 + internlm2_5-7b-chat-hf: + gsm8k_accuracy: 84.38 + race-high_accuracy: 90.62 + internlm2_5-7b-chat-turbomind: + gsm8k_accuracy: 87.50 + race-high_accuracy: 90.62 + internlm2-chat-1.8b-turbomind: + gsm8k_accuracy: 28.12 + race-high_accuracy: 84.38 + internlm2-chat-1.8b-sft-turbomind: + gsm8k_accuracy: 21.88 + race-high_accuracy: 84.38 + internlm2-chat-7b-lmdeploy: + gsm8k_accuracy: 53.12 + race-high_accuracy: 84.38 + internlm2-chat-7b-sft-turbomind: + gsm8k_accuracy: 53.12 + race-high_accuracy: 90.62 + internlm2-chat-7b-vllm: + gsm8k_accuracy: 56.25 + race-high_accuracy: 84.38 + llama-3_1-8b-instruct-hf: + gsm8k_accuracy: 84.38 + race-high_accuracy: 90.62 + llama-3_2-3b-instruct-hf: + gsm8k_accuracy: 68.75 + race-high_accuracy: 81.25 + llama-3-8b-instruct-hf: + gsm8k_accuracy: 68.75 + race-high_accuracy: 87.5 + llama-2-7b-chat-turbomind: + gsm8k_accuracy: 18.75 + race-high_accuracy: 46.88 + llama-3_1-8b-instruct-turbomind: + gsm8k_accuracy: 78.12 + race-high_accuracy: 90.62 + llama-3_2-3b-instruct-turbomind: + gsm8k_accuracy: 71.88 + race-high_accuracy: 81.25 + llama-3-8b-instruct-turbomind: + gsm8k_accuracy: 71.88 + race-high_accuracy: 87.5 + mistral-7b-instruct-v0.2-hf: + gsm8k_accuracy: 40.62 + race-high_accuracy: 75 + mistral-7b-instruct-v0.3-hf: + gsm8k_accuracy: 40.62 + race-high_accuracy: 75 + mistral-nemo-instruct-2407-hf: + gsm8k_accuracy: 75 + race-high_accuracy: 81.25 + mistral-nemo-instruct-2407-turbomind: + gsm8k_accuracy: 65.62 + race-high_accuracy: 87.50 + mistral-7b-instruct-v0.1-vllm: + gsm8k_accuracy: 34.38 + race-high_accuracy: 68.75 + mistral-7b-instruct-v0.2-vllm: + gsm8k_accuracy: 43.75 + race-high_accuracy: 75 + MiniCPM3-4B-hf: + gsm8k_accuracy: 68.75 + race-high_accuracy: 84.38 + phi-3-mini-4k-instruct-hf: + gsm8k_accuracy: 56.25 + race-high_accuracy: 84.38 + phi-3-small-8k-instruct-hf: + gsm8k_accuracy: 0 + race-high_accuracy: 0 + qwen2.5-0.5b-instruct-hf: + gsm8k_accuracy: 34.38 + race-high_accuracy: 46.88 + qwen2.5-3b-instruct-hf : + gsm8k_accuracy: 53.12 + race-high_accuracy: 90.62 + qwen2.5-0.5b-instruct-turbomind: + gsm8k_accuracy: 28.12 + race-high_accuracy: 50 + qwen2.5-3b-instruct-turbomind: + gsm8k_accuracy: 59.38 + race-high_accuracy: 90.62 + qwen1.5-0.5b-chat-hf: + gsm8k_accuracy: 0 + race-high_accuracy: 53.12 + qwen2-1.5b-instruct-hf: + gsm8k_accuracy: 62.5 + race-high_accuracy: 84.38 + qwen2-7b-instruct-hf: + gsm8k_accuracy: 68.75 + race-high_accuracy: 90.62 + qwen2-1.5b-instruct-turbomind: + gsm8k_accuracy: 53.12 + race-high_accuracy: 84.38 + qwen2-7b-instruct-turbomind: + gsm8k_accuracy: 81.25 + race-high_accuracy: 90.62 + qwen1.5-0.5b-chat-vllm: + gsm8k_accuracy: 3.12 + race-high_accuracy: 53.12 + yi-1.5-6b-chat-hf: + gsm8k_accuracy: 65.62 + race-high_accuracy: 84.38 + yi-1.5-9b-chat-hf: + gsm8k_accuracy: 75 + race-high_accuracy: 93.75 + yi-1.5-6b-chat-turbomind: + gsm8k_accuracy: 62.5 + race-high_accuracy: 84.38 + yi-1.5-9b-chat-turbomind: + gsm8k_accuracy: 71.88 + race-high_accuracy: 93.75 + deepseek-v2-lite-chat-hf: + gsm8k_accuracy: 46.88 + race-high_accuracy: 71.88 + gemma2-27b-it-hf: + gsm8k_accuracy: 75 + race-high_accuracy: 93.75 + internlm2_5-20b-chat-hf: + gsm8k_accuracy: 84.38 + race-high_accuracy: 87.5 + internlm2_5-20b-chat-turbomind: + gsm8k_accuracy: 87.50 + race-high_accuracy: 87.5 + mistral-small-instruct-2409-hf: + gsm8k_accuracy: 81.25 + race-high_accuracy: 87.50 + mistral-small-instruct-2409-turbomind: + gsm8k_accuracy: 81.25 + race-high_accuracy: 87.50 + qwen2.5-14b-instruct-hf: + gsm8k_accuracy: 71.88 + race-high_accuracy: 96.88 + qwen2.5-14b-instruct-turbomind: + gsm8k_accuracy: 68.75 + race-high_accuracy: 93.75 + yi-1.5-34b-chat-turbomind: + gsm8k_accuracy: 78.12 + race-high_accuracy: 93.75 + deepseek-67b-chat-hf: + gsm8k_accuracy: 71.88 + race-high_accuracy: 78.12 + llama-3_3-70b-instruct-turbomind: + gsm8k_accuracy: 93.75 + race-high_accuracy: 87.5 + mixtral-8x7b-instruct-v0.1-hf: + gsm8k_accuracy: 56.25 + race-high_accuracy: 81.25 + mixtral-large-instruct-2411-turbomind: + gsm8k_accuracy: 90.62 + race-high_accuracy: 93.75 + nvidia-3_1-Nemotron-70b-instruct-HF-turbomind: + gsm8k_accuracy: 87.5 + race-high_accuracy: 46.88 + qwen2.5-72b-instruct-turbomind: + gsm8k_accuracy: 75 + race-high_accuracy: 93.75 + deepseek-v2_5-1210-turbomind: + gsm8k_accuracy: 90.62 + race-high_accuracy: 84.38 + mixtral-8x22b-instruct-v0.1-hf: + gsm8k_accuracy: 81.25 + race-high_accuracy: 81.25 +base: + glm-4-9b-hf: + gsm8k_accuracy: 68.75 + GPQA_diamond_accuracy: 31.25 + race-high_accuracy: 93.75 + winogrande_accuracy: 84.38 + glm-4-9b-turbomind: + gsm8k_accuracy: 62.5 + GPQA_diamond_accuracy: 28.12 + race-high_accuracy: 93.75 + winogrande_accuracy: 84.38 + deepseek-7b-base-hf: + gsm8k_accuracy: 25 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 46.88 + winogrande_accuracy: 71.88 + deepseek-moe-16b-base-hf: + gsm8k_accuracy: 21.88 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 21.88 + winogrande_accuracy: 65.62 + deepseek-7b-base-turbomind: + gsm8k_accuracy: 21.88 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 46.88 + winogrande_accuracy: 84.38 + deepseek-moe-16b-base-vllm: + gsm8k_accuracy: 21.88 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 25 + winogrande_accuracy: 68.75 + gemma2-2b-hf: + gsm8k_accuracy: 28.12 + GPQA_diamond_accuracy: 3.12 + race-high_accuracy: 56.25 + winogrande_accuracy: 71.88 + gemma2-9b-hf: + gsm8k_accuracy: 68.75 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 81.25 + winogrande_accuracy: 84.38 + gemma-2b-hf: + gsm8k_accuracy: 18.75 + GPQA_diamond_accuracy: 3.12 + race-high_accuracy: 25 + winogrande_accuracy: 53.12 + gemma-7b-hf: + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 65.62 + winogrande_accuracy: 78.12 + gemma-2b-vllm: + gsm8k_accuracy: 15.62 + GPQA_diamond_accuracy: 3.12 + race-high_accuracy: + winogrande_accuracy: + gemma-7b-vllm: + gsm8k_accuracy: 53.12 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: + winogrande_accuracy: + internlm2_5-7b-hf: + gsm8k_accuracy: 37.5 + GPQA_diamond_accuracy: 25 + race-high_accuracy: 93.75 + winogrande_accuracy: 71.88 + internlm2-7b-hf: + gsm8k_accuracy: 53.12 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 62.5 + winogrande_accuracy: 78.12 + internlm2-base-7b-hf: + gsm8k_accuracy: 3.12 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 75 + winogrande_accuracy: 65.62 + internlm2-1.8b-turbomind: + gsm8k_accuracy: 12.5 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: 71.88 + winogrande_accuracy: 78.12 + internlm2_5-7b-turbomind: + gsm8k_accuracy: 62.50 + GPQA_diamond_accuracy: 34.38 + race-high_accuracy: 93.75 + winogrande_accuracy: 87.50 + internlm2-7b-turbomind: + gsm8k_accuracy: 53.12 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 71.88 + winogrande_accuracy: 84.38 + internlm2-base-7b-turbomind: + gsm8k_accuracy: 37.50 + GPQA_diamond_accuracy: 28.12 + race-high_accuracy: 81.25 + winogrande_accuracy: 75 + llama-2-7b-hf: + gsm8k_accuracy: 21.88 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 40.62 + winogrande_accuracy: 71.88 + llama-3_1-8b-hf: + gsm8k_accuracy: 78.12 + GPQA_diamond_accuracy: 25 + race-high_accuracy: 90.62 + winogrande_accuracy: 62.5 + llama-3-8b-hf: + gsm8k_accuracy: 46.88 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 65.62 + winogrande_accuracy: 65.62 + llama-3.1-8b-turbomind: + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: 78.12 + winogrande_accuracy: 78.12 + llama-3-8b-turbomind: + gsm8k_accuracy: 50 + GPQA_diamond_accuracy: 12.50 + race-high_accuracy: 65.62 + winogrande_accuracy: 78.12 + mistral-7b-v0.2-hf: + gsm8k_accuracy: 31.25 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 62.5 + winogrande_accuracy: 59.38 + mistral-7b-v0.3-hf: + gsm8k_accuracy: 31.25 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 62.5 + winogrande_accuracy: 59.38 + mistral-7b-v0.2-vllm: + gsm8k_accuracy: 34.38 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 62.5 + winogrande_accuracy: 65.62 + qwen2.5-7b-hf: + gsm8k_accuracy: 81.25 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 87.5 + winogrande_accuracy: 71.88 + qwen2.5-1.5b-turbomind: + gsm8k_accuracy: 62.50 + GPQA_diamond_accuracy: 12.50 + race-high_accuracy: 78.12 + winogrande_accuracy: 68.75 + qwen2.5-7b-turbomind: + gsm8k_accuracy: 75.00 + GPQA_diamond_accuracy: 25 + race-high_accuracy: 87.5 + winogrande_accuracy: 71.88 + qwen1.5-moe-a2.7b-hf: + gsm8k_accuracy: 62.5 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 84.38 + winogrande_accuracy: 75 + qwen2-0.5b-hf: + gsm8k_accuracy: 25 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 40.62 + winogrande_accuracy: 62.5 + qwen2-1.5b-hf: + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: 81.25 + winogrande_accuracy: 62.5 + qwen2-7b-hf: + gsm8k_accuracy: 68.75 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: 87.5 + winogrande_accuracy: 68.75 + qwen2-1.5b-turbomind: + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: 81.25 + winogrande_accuracy: 75 + qwen2-7b-turbomind: + gsm8k_accuracy: 75.00 + GPQA_diamond_accuracy: 12.5 + race-high_accuracy: 87.5 + winogrande_accuracy: 71.88 + qwen1.5-0.5b-vllm: + gsm8k_accuracy: 9.38 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 56.25 + winogrande_accuracy: 62.5 + yi-1.5-6b-hf: + gsm8k_accuracy: 62.5 + GPQA_diamond_accuracy: 3.12 + race-high_accuracy: 87.5 + winogrande_accuracy: 62.5 + yi-1.5-9b-hf: + gsm8k_accuracy: 75 + GPQA_diamond_accuracy: 40.62 + race-high_accuracy: 87.5 + winogrande_accuracy: 59.38 + yi-1.5-9b-turbomind: + gsm8k_accuracy: 78.12 + GPQA_diamond_accuracy: 40.62 + race-high_accuracy: 87.5 + winogrande_accuracy: 71.88 + deepseek-v2-lite-hf: + gsm8k_accuracy: 31.25 + GPQA_diamond_accuracy: 28.12 + race-high_accuracy: 59.38 + winogrande_accuracy: 71.88 + internlm2-20b-hf: + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 68.75 + winogrande_accuracy: 75 + internlm2-base-20b-hf: + gsm8k_accuracy: 12.5 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: 84.38 + winogrande_accuracy: 65.62 + internlm2-20b-turbomind: + gsm8k_accuracy: 71.88 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 68.75 + winogrande_accuracy: 81.25 + qwen2.5-14b-hf: + gsm8k_accuracy: 75 + GPQA_diamond_accuracy: 37.5 + race-high_accuracy: 93.75 + winogrande_accuracy: 84.38 + qwen2.5-32b-hf: + gsm8k_accuracy: 87.5 + GPQA_diamond_accuracy: 31.25 + race-high_accuracy: 93.75 + winogrande_accuracy: 78.12 + qwen2.5-32b-turbomind: + gsm8k_accuracy: 84.38 + GPQA_diamond_accuracy: 28.12 + race-high_accuracy: 93.75 + winogrande_accuracy: 81.25 + deepseek-67b-base-hf: + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 31.25 + race-high_accuracy: 81.25 + winogrande_accuracy: 90.62 + deepseek-67b-base-turbomind: + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 28.12 + race-high_accuracy: 81.25 + winogrande_accuracy: 84.38 + llama-3-70b-turbomind: + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 9.38 + race-high_accuracy: 93.75 + winogrande_accuracy: 84.38 + qwen2.5-72b-turbomind: + gsm8k_accuracy: 84.38 + GPQA_diamond_accuracy: 34.38 + race-high_accuracy: 93.75 + winogrande_accuracy: 87.5 + deepseek-v2-turbomind: + gsm8k_accuracy: 62.5 + GPQA_diamond_accuracy: 3.12 + race-high_accuracy: 81.25 + winogrande_accuracy: 68.75 + llama-3-70b-hf: + gsm8k_accuracy: 62.5 + GPQA_diamond_accuracy: 3.12 + race-high_accuracy: 93.75 + winogrande_accuracy: 84.38 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 35614e6a..4ea85c19 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -28,21 +28,21 @@ on: description: 'Set branch or tag or commit id. Default is "main"' type: string default: 'main' - regression_func: + regression_func_volc: required: true description: 'regression functions' type: string - default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']" - cuda_env: + default: "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']" + regression_func_local: required: true - description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']" + description: 'regression functions' type: string - default: "['dsw_cu12']" + default: "['cmd', 'api', 'chat_sub_fullbench']" fullbench_eval: required: true description: 'fullbench volc functions' type: string - default: "['base_long_context','base_objective','chat_long_context','chat_objective','chat_subjective']" + default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']" schedule: - cron: '15 14 * * *' @@ -54,6 +54,13 @@ env: LMDEPLOY_USE_MODELSCOPE: false HF_HUB_OFFLINE: 1 OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} + CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3 + PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip + REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression + COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache + HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub + HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub + CONDA_ENV: regression_test jobs: build-pypi: @@ -117,14 +124,7 @@ jobs: prepare_env: if: ${{!cancelled()}} needs: ['build-pypi', 'build-pypi-lmdeploy'] - strategy: - fail-fast: false - matrix: - cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} - runs-on: ${{ matrix.cuda_env }} - env: - CONDA_ENV: opencompass_regression - PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip + runs-on: volc_cu12 environment: 'prod' timeout-minutes: 240 #4hours steps: @@ -140,79 +140,52 @@ jobs: - name: Remove Conda Env if: always() run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate + conda env remove -y --name ${{env.CONDA_ENV}} conda info --envs - - name: Prepare - create conda env and install torch - cu11 - if: ${{matrix.cuda_env == 'dsw_cu11'}} - uses: nick-fields/retry@v3 - id: retry1 - with: - max_attempts: 3 - timeout_minutes: 40 - command: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip uninstall torch torchvision torchaudio -y - pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - conda info --envs - pip list - name: Prepare - create conda env and install torch - cu12 - if: ${{matrix.cuda_env == 'dsw_cu12'}} uses: nick-fields/retry@v3 - id: retry2 with: max_attempts: 3 - timeout_minutes: 40 + timeout_minutes: 240 command: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}} + . ${{env.CONDA_PATH}}/bin/activate + conda create -y --name ${{env.CONDA_ENV}} python=3.10 + conda activate ${{env.CONDA_ENV}} + pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - conda info --envs - pip list + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}} + cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data - name: Prepare - reinstall lmdeploy - cu12 - if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} + if: ${{inputs.build_lmdeploy}} uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 - name: Prepare - reinstall lmdeploy - cu12 - if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} + if: ${{inputs.build_lmdeploy}} run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} pip install lmdeploy-*.whl --no-deps + - name: conda env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list - daily_run_test: + daily_run_test_volc: if: ${{!cancelled()}} needs: prepare_env strategy: fail-fast: false matrix: - cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} - regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}} - runs-on: ${{ matrix.cuda_env }} - env: - CONDA_ENV: opencompass_regression - PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip - HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub - HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub - HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub - COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache - REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report + regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} + runs-on: volc_cu12_daily environment: 'prod' timeout-minutes: 240 #4hours steps: @@ -221,105 +194,114 @@ jobs: with: repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Prepare - prepare data and hf model + - name: conda env run: | - rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p - ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + - name: modify config + if: matrix.regression_func != 'chat_sub_fullbench' + run: | + cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py . + cat /fs-computility/llm/qa-llm-cicd/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py + - name: Run test + uses: nick-fields/retry@v3 + with: + max_attempts: 3 + timeout_minutes: 40 + command: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily + python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py + + + daily_run_test_local: + if: ${{!cancelled()}} + needs: prepare_env + strategy: + fail-fast: false + matrix: + regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}} + runs-on: volc_cu12_local + environment: 'prod' + timeout-minutes: 240 #4hours + steps: + - name: Clone repository + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: conda env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + - name: modify config + if: matrix.regression_func == 'chat_sub_fullbench' + run: | + cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py . + cat /fs-computility/llm/qa-llm-cicd/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py - name: Run command testcase if: matrix.regression_func == 'cmd' run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - - name: Run chat model test - if: matrix.regression_func == 'chat_models' - run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - conda info --envs - opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily - python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py - - name: Run base model test - if: matrix.regression_func == 'base_models' - run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - conda info --envs - opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily - python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py - - name: Run chat model test - fullbench - if: matrix.regression_func == 'chat_obj_fullbench' - run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - conda info --envs - opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily - python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py - - name: Run chat model test - fullbench - if: matrix.regression_func == 'chat_sub_fullbench' - env: - COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset - run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - conda info --envs - opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily - python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py - - name: Run base model test - fullbench - if: matrix.regression_func == 'base_fullbench' - run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - conda info --envs - opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily - python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run model test - api if: matrix.regression_func == 'api' run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} conda info --envs lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" sleep 120s - opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details - rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run model test - api kill if: always() && matrix.regression_func == 'api' run: | kill -15 "$restful_pid" + - name: Run testcase + if: matrix.regression_func == 'chat_sub_fullbench' + env: + COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache_subset + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + export from_tf=TRUE + opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily + python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py fullbench_run_test: if: ${{!cancelled()}} - needs: ['build-pypi', 'build-pypi-lmdeploy'] - env: - FULLBENCH_CONDA_ENV: regression_test - FULLBENCH_REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression - COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache + needs: prepare_env strategy: fail-fast: false matrix: - function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_long_context","base_objective","chat_long_context","chat_objective","chat_subjective"]')}} + function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}} runs-on: volc_cu12 environment: 'prod' timeout-minutes: 360 #6hours @@ -329,48 +311,30 @@ jobs: with: repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Download Artifacts - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }} - - name: Prepare - reinstall opencompass - cu12 - if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} + - name: conda env run: | - . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.FULLBENCH_CONDA_ENV}} - pip install opencompass*.whl --no-deps - - name: Prepare - reinstall lmdeploy - cu12 - if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Prepare - reinstall lmdeploy - cu12 - if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} - run: | - . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.FULLBENCH_CONDA_ENV}} - pip install lmdeploy-*.whl --no-deps - - name: Conda env - if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} - run: | - . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.FULLBENCH_CONDA_ENV}} + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} conda info --envs pip list - - name: Run command testcase - run: | - . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.FULLBENCH_CONDA_ENV}} - conda info --envs - export from_tf=TRUE - opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse - rm regression_result_daily -f && ln -s ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily - python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py + - name: Run testcase + uses: nick-fields/retry@v3 + with: + max_attempts: 3 + timeout_minutes: 240 + command: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + export from_tf=TRUE + opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily + python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py notify_to_feishu: if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} - needs: [daily_run_test, fullbench_run_test] + needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] environment: 'prod' timeout-minutes: 5 runs-on: self-hosted diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index bc829eab..ef067720 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -18,18 +18,23 @@ concurrency: cancel-in-progress: true env: - CONDA_ENV: opencompass_ - USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd - HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + CONDA_ENV: pr_test HF_DATASETS_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 - HF_HUB_OFFLINE: 1 VLLM_USE_MODELSCOPE: false LMDEPLOY_USE_MODELSCOPE: false + HF_HUB_OFFLINE: 1 + CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3 + PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip + REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest + COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache + HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub + HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub jobs: pr_run_test: - runs-on: dsw_cu12 + runs-on: volc_cu12_local environment: 'prod' timeout-minutes: 30 steps: @@ -37,54 +42,55 @@ jobs: uses: actions/checkout@v2 - name: Prepare - Install opencompass run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}${{ runner.name }} + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} python3 -m pip uninstall opencompass -y - python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip + python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs - - name: Prepare - prepare data and hf model + - name: conda env run: | - cp -r ${{env.USERSPACE_PREFIX}}/data . - rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p - ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + lmdeploy check_env - name: Run test run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}${{ runner.name }} + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} conda info --envs rm -rf regression_result - opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result1 --debug - opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result2 --debug --max-num-workers 2 - opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir regression_result3 --debug --max-num-workers 2 + opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug + opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2 + opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2 - name: Get result run: | - score=$(sed -n '$p' regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}') + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}') if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then echo "score is $score between 88 and 89" else echo "score is $score not between 88 and 89" exit 1 fi - score=$(sed -n '$p' regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}') + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}') if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then echo "score is $score between 87 and 88" else echo "score is $score not between 87 and 88" exit 1 fi - score=$(sed -n '$p' regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}') + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}') if (( ${score%.*} >= 87 && ${score%.*} <= 89 )); then echo "score is $score between 87 and 89" else echo "score is $score not between 87 and 89" exit 1 fi - rm -rf regression_result1 & rm -rf regression_result2 & rm -rf regression_result3 - name: Uninstall opencompass if: always() run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}}${{ runner.name }} + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} python3 -m pip uninstall opencompass -y conda info --envs