diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py index ba1902a9..98f0fdf0 100644 --- a/.github/scripts/eval_regression_api.py +++ b/.github/scripts/eval_regression_api.py @@ -24,9 +24,9 @@ models = [ abbr='lmdeploy-api-test', type=OpenAISDK, key='EMPTY', - openai_api_base='http://0.0.0.0:23333/v1', - path='internlm2', - tokenizer_path='internlm/internlm2_5-7b-chat', + openai_api_base='http://localhost:23333/v1', + path='internlm3', + tokenizer_path='internlm/internlm3-8b-instruct', rpm_verbose=True, meta_template=api_meta_template, query_per_second=128, diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py index a8dc7a60..4259cc36 100644 --- a/.github/scripts/eval_regression_base_models.py +++ b/.github/scripts/eval_regression_base_models.py @@ -11,18 +11,10 @@ with read_base(): from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ winogrande_datasets # noqa: F401, E501 # read hf models - chat models - from opencompass.configs.models.chatglm.hf_glm4_9b import \ - models as hf_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \ models as lmdeploy_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \ models as hf_deepseek_7b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \ - models as hf_deepseek_67b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ - models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ - models as hf_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base 
import \ @@ -49,12 +41,6 @@ with read_base(): models as hf_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ models as hf_internlm2_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \ - models as hf_internlm2_20b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \ - models as hf_internlm2_base_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \ - models as hf_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \ models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ @@ -65,14 +51,14 @@ with read_base(): models as lmdeploy_internlm2_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \ + models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama2_7b import \ models as hf_llama2_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \ models as hf_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b import \ models as hf_llama3_8b_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama3_70b import \ - models as hf_llama3_70b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py index 40ec1bc5..bfe923f6 
100644 --- a/.github/scripts/eval_regression_chat_models.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -15,14 +15,24 @@ with read_base(): models as vllm_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \ models as hf_deepseek_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \ - models as hf_deepseek_67b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \ - models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \ - models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \ + models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \ + models as \ + lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \ + models as \ + lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \ + models as \ + lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \ + models as \ + lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \ models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \ + models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \ models as vllm_deepseek_7b_chat_model # noqa: F401, E501 from 
opencompass.configs.models.gemma.hf_gemma2_2b_it import \ @@ -45,6 +55,8 @@ with read_base(): models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \ models as hf_internlm2_5_20b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \ + models as hf_internlm3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ @@ -57,6 +69,8 @@ with read_base(): models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \ models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \ + models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \ models as vllm_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ @@ -83,10 +97,6 @@ with read_base(): models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ - models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \ - models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \ models as \ lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501 @@ -95,14 +105,19 @@ 
with read_base(): from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ models as \ lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \ + models as \ + lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \ models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \ + models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \ models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501 - from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \ - models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501 + from opencompass.configs.models.phi.hf_phi_4 import \ + models as hf_phi_4_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \ models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \ @@ -142,6 +157,8 @@ with read_base(): from ...volc import infer as volc_infer # noqa: F401, E501 +hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf' + race_datasets = [race_datasets[1]] datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 4ef414dc..1cbc5ad2 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -175,10 +175,11 @@ class TestApibench: class TestVolcFullbench: """Test cases for chat model.""" - @pytest.mark.parametrize( - 'model, 
dataset', - [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind'] - for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')]) + @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ + 'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind', + 'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch', + 'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch' + ] for p2 in dataset_list(p1, 'objective')]) @pytest.mark.chat_objective def test_chat_objective(self, baseline_scores_fullbench, result_scores, model, dataset): @@ -245,10 +246,7 @@ class TestCmdCase: @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b-hf', 'race-middle_accuracy'), ('internlm2_5-7b-hf', 'race-high_accuracy'), - ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'), - ('internlm2-1.8b-hf', 'race-middle_accuracy'), - ('internlm2-1.8b-hf', 'race-high_accuracy'), - ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')]) + ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')]) def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -260,9 +258,9 @@ class TestCmdCase: [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')]) + ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -280,13 +278,25 @@ class TestCmdCase: @pytest.mark.case4 
@pytest.mark.parametrize( - 'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'), - ('internlm2_5-7b-chat_hf', 'race-high_accuracy'), - ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')]) + 'model, dataset', + [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score, dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + @pytest.mark.case5 + @pytest.mark.parametrize( + 'model, dataset', + [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')]) + def test_cmd_case5(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) def assert_score(model_type, score, baseline, dataset: str = ''): diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index cd2e3328..e4567553 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -8,20 +8,25 @@ internlm2_5-7b_hf: race-middle_accuracy: 91.78 race-high_accuracy: 90.02 -internlm2-1.8b-hf: - demo_gsm8k_accuracy: 15.62 - race-middle_accuracy: 71.66 - race-high_accuracy: 66.38 - internlm2_5-7b-chat-lmdeploy: - demo_gsm8k_accuracy: 89.06 + demo_gsm8k_accuracy: 87.50 race-middle_accuracy: 92.76 race-high_accuracy: 90.54 -internlm2-chat-1.8b-lmdeploy: - demo_gsm8k_accuracy: 31 - race-middle_accuracy: 81.34 - race-high_accuracy: 73.96 
+internlm3-8b-instruct-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-vllm: + demo_gsm8k_accuracy: 81.25 + race-middle_accuracy: 92.20 + race-high_accuracy: 89.88 internlm2_5-7b-chat_hf: demo_gsm8k_accuracy: 87.50 @@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf: race-high_accuracy: 90.48 lmdeploy-api-test: - gsm8k_accuracy: 68.75 - race-middle_accuracy: 87.50 + gsm8k_accuracy: 56.25 + race-middle_accuracy: 93.75 race-high_accuracy: 93.75 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index c0e735fb..3f5753d3 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench: college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 20 + alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 CompassArena_naive_average: 43 mtbench101_avg: 7.8 - wildbench_average: -12.78 + wildbench_average: -15.56 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 7.90 + alignment_bench_v1_1_专业能力: 8.00 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 20 + alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 compassarena_knowledge_naive_average: 55 compassarena_reason_v2_naive_average: 40 @@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench: internlm2_5-7b-chat-turbomind_fullbench: objective: race-high_accuracy: 93.75 - ARC-c_accuracy: 93.75 + ARC-c_accuracy: 87.50 BoolQ_accuracy: 68.75 
triviaqa_wiki_1shot_score: 50 nq_open_1shot_score: 25 IFEval_Prompt-level-strict-accuracy: 56.25 - drop_accuracy: 81.25 + drop_accuracy: 75 GPQA_diamond_accuracy: 31.25 - hellaswag_accuracy: 81.25 - TheoremQA_score: 6.25 + hellaswag_accuracy: 87.5 + TheoremQA_score: 12.5 musr_average_naive_average: 39.58 - korbench_single_naive_average: 37.50 - gsm8k_accuracy: 68.75 - math_accuracy: 68.75 + korbench_single_naive_average: 40 + gsm8k_accuracy: 62.5 + math_accuracy: 75 cmo_fib_accuracy: 6.25 aime2024_accuracy: 6.25 - wikibench-wiki-single_choice_cncircular_perf_4: 50.00 + wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 68.75 - ds1000_naive_average: 16.96 + ds1000_naive_average: 17.86 lcb_code_generation_pass@1: 12.5 lcb_code_execution_pass@1: 43.75 - lcb_test_output_pass@1: 25.00 - bbh-logical_deduction_seven_objects_score: 50.00 - bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_accuracy: 69.71 - cmmlu-china-specific_accuracy: 75.83 + lcb_test_output_pass@1: 18.75 + bbh-logical_deduction_seven_objects_score: 56.25 + bbh-multistep_arithmetic_two_score: 75 + mmlu-other_accuracy: 72.6 + cmmlu-china-specific_accuracy: 78.33 mmlu_pro_math_accuracy: 31.25 - ds1000_Pandas_accuracy: 0 + ds1000_Pandas_accuracy: 12.5 ds1000_Numpy_accuracy: 0 ds1000_Tensorflow_accuracy: 12.5 - ds1000_Scipy_accuracy: 18.75 + ds1000_Scipy_accuracy: 25 ds1000_Sklearn_accuracy: 18.75 - ds1000_Pytorch_accuracy: 18.75 + ds1000_Pytorch_accuracy: 6.25 ds1000_Matplotlib_accuracy: 50.00 openai_mmmlu_lite_AR-XY_accuracy: 37.5 college_naive_average: 12.50 college_knowledge_naive_average: 87.5 subjective: - alignment_bench_v1_1_总分: 0.70 + alignment_bench_v1_1_总分: 0.66 alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 - CompassArena_naive_average: 38 - mtbench101_avg: 7.80 - wildbench_average: -4.86 + CompassArena_naive_average: 40 + mtbench101_avg: 8 + wildbench_average: -6.81 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 
- alignment_bench_v1_1_专业能力: 8.4 + alignment_bench_v1_1_专业能力: 7.9 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench: alignment_bench_v1_1_综合问答: 0 alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 50 - compassarena_reason_v2_naive_average: 30 - compassarena_math_v2_naive_average: 50 - compassarena_creationv2_zh_naive_average: 25 + compassarena_knowledge_naive_average: 45 + compassarena_reason_v2_naive_average: 25 + compassarena_math_v2_naive_average: 60 + compassarena_creationv2_zh_naive_average: 35 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 @@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 25.00 + TheoremQA_score: 31.25 winogrande_accuracy: 87.5 - gsm8k_accuracy: 62.50 - GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25 + gsm8k_accuracy: 56.25 + GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 math_accuracy: 18.75 wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 62.50 - dingo_en_192_score: 31.25 + dingo_en_192_score: 50.00 dingo_zh_170_score: 93.75 mmlu-other_accuracy: 76.92 cmmlu-china-specific_accuracy: 84.17 mmlu_pro_math_accuracy: 18.75 - bbh-logical_deduction_seven_objects_score: 50 + bbh-logical_deduction_seven_objects_score: 43.75 bbh-multistep_arithmetic_two_score: 56.25 college_naive_average: 12.5 college_knowledge_naive_average: 87.5 @@ -409,7 +409,7 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_koala: 28.21 alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 - alpaca_eval_vicuna: 25 + alpaca_eval_vicuna: 33.75 compassarena_language_naive_average: 52.5 compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 @@ -454,3 +454,530 @@ 
internlm2_5-7b-chat-1m-turbomind: longbench_few-shot-learning_score: 51.67 longbench_synthetic-tasks_score: 66.83 longbench_code-completion_score: 45.99 + + +qwen2.5-7b-instruct-turbomind: + objective: + race-high_accuracy: 84.99 + ARC-c_accuracy: 92.2 + BoolQ_accuracy: 86.7 + triviaqa_wiki_1shot_score: 53.06 + nq_open_1shot_score: 17.51 + mmmlu_lite_naive_average: 54.96 + IFEval_Prompt-level-strict-accuracy: 71.53 + drop_accuracy: 80.07 + bbh_naive_average: 68.81 + GPQA_diamond_accuracy: 34.34 + hellaswag_accuracy: 85.42 + TheoremQA_score: 18.38 + musr_average_naive_average: 43.44 + korbench_single_naive_average: 39.44 + ARC_Prize_Public_Evaluation_accuracy: 0 + gsm8k_accuracy: 92.57 + GaokaoBench_weighted_average: 80.14 + math_accuracy: 73.58 + cmo_fib_accuracy: 25 + aime2024_accuracy: 16.67 + Mathbench_naive_average: 77.33 + wikibench-wiki-single_choice_cncircular_perf_4: 34.9 + cmmlu_naive_average: 75.97 + mmlu_naive_average: 76.01 + mmlu_pro_naive_average: 56.12 + openai_humaneval_humaneval_pass@1: 83.54 + sanitized_mbpp_score: 74.71 + humanevalx_naive_average: 48.29 + ds1000_naive_average: 18.66 + lcb_code_generation_pass@1: 39.5 + lcb_code_execution_pass@1: 42.38 + lcb_test_output_pass@1: 50.68 + bigcodebench_hard_instruct_pass@1: 16.22 + bigcodebench_hard_complete_pass@1: 11.49 + teval_naive_average: 79.72 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 99.01 + mmlu_accuracy: 76.01 + mmlu-stem_accuracy: 77.59 + mmlu-social-science_accuracy: 79.02 + mmlu-humanities_accuracy: 72.07 + mmlu-other_accuracy: 74.86 + cmmlu_accuracy: 75.97 + cmmlu-stem_accuracy: 73.09 + cmmlu-social-science_accuracy: 75.95 + cmmlu-humanities_accuracy: 76.53 + cmmlu-other_accuracy: 78.79 + cmmlu-china-specific_accuracy: 73.17 + mmlu_pro_accuracy: 56.12 + mmlu_pro_biology_accuracy: 71.41 + mmlu_pro_business_accuracy: 67.68 + mmlu_pro_chemistry_accuracy: 54.59 + mmlu_pro_computer_science_accuracy: 58.29 + mmlu_pro_economics_accuracy: 66.82 + mmlu_pro_engineering_accuracy: 42.41 + 
mmlu_pro_health_accuracy: 55.87 + mmlu_pro_history_accuracy: 46.46 + mmlu_pro_law_accuracy: 28.97 + mmlu_pro_math_accuracy: 73.13 + mmlu_pro_philosophy_accuracy: 44.89 + mmlu_pro_physics_accuracy: 58.43 + mmlu_pro_psychology_accuracy: 63.16 + mmlu_pro_other_accuracy: 53.57 + humanevalx-python_pass@1: 50 + humanevalx-cpp_pass@1: 42.07 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 74.39 + humanevalx-js_pass@1: 75 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 8.18 + ds1000_Tensorflow_accuracy: 17.78 + ds1000_Scipy_accuracy: 15.09 + ds1000_Sklearn_accuracy: 10.43 + ds1000_Pytorch_accuracy: 4.41 + ds1000_Matplotlib_accuracy: 60.65 + mmmlu_lite_accuracy: 54.96 + openai_mmmlu_lite_AR-XY_accuracy: 42.32 + openai_mmmlu_lite_BN-BD_accuracy: 42.25 + openai_mmmlu_lite_DE-DE_accuracy: 59.93 + openai_mmmlu_lite_ES-LA_accuracy: 66.53 + openai_mmmlu_lite_FR-FR_accuracy: 66.88 + openai_mmmlu_lite_HI-IN_accuracy: 49.26 + openai_mmmlu_lite_ID-ID_accuracy: 61.26 + openai_mmmlu_lite_IT-IT_accuracy: 65.47 + openai_mmmlu_lite_JA-JP_accuracy: 61.54 + openai_mmmlu_lite_KO-KR_accuracy: 60.28 + openai_mmmlu_lite_PT-BR_accuracy: 55.51 + openai_mmmlu_lite_SW-KE_accuracy: 36.42 + openai_mmmlu_lite_YO-NG_accuracy: 32.14 + openai_mmmlu_lite_ZH-CN_accuracy: 69.61 + college_naive_average: 48 + high_naive_average: 59 + middle_naive_average: 78 + primary_naive_average: 85.67 + arithmetic_naive_average: 75.67 + mathbench-a (average)_naive_average: 69.27 + college_knowledge_naive_average: 83.86 + high_knowledge_naive_average: 80.29 + middle_knowledge_naive_average: 84.26 + primary_knowledge_naive_average: 93.16 + mathbench-t (average)_naive_average: 85.39 + + + + +internlm2_5-7b-chat-pytorch: + objective: + race-high_accuracy: 86.39 + ARC-c_accuracy: 90.51 + BoolQ_accuracy: 88.01 + triviaqa_wiki_1shot_score: 64.77 + nq_open_1shot_score: 22.71 + mmmlu_lite_naive_average: 45.02 + IFEval_Prompt-level-strict-accuracy: 56.56 + drop_accuracy: 75.46 + bbh_naive_average: 73.34 + 
GPQA_diamond_accuracy: 32.83 + hellaswag_accuracy: 94.81 + TheoremQA_score: 23.88 + musr_average_naive_average: 51.31 + korbench_single_naive_average: 32 + ARC_Prize_Public_Evaluation_accuracy: 0.01 + gsm8k_accuracy: 86.96 + GaokaoBench_weighted_average: 78.05 + math_accuracy: 60.34 + cmo_fib_accuracy: 12.98 + aime2024_accuracy: 3.33 + Mathbench_naive_average: 64.82 + wikibench-wiki-single_choice_cncircular_perf_4: 31.7 + cmmlu_naive_average: 74.24 + mmlu_naive_average: 70.2 + mmlu_pro_naive_average: 45.39 + openai_humaneval_humaneval_pass@1: 70.12 + sanitized_mbpp_score: 64.59 + humanevalx_naive_average: 38.78 + ds1000_naive_average: 14.19 + lcb_code_generation_pass@1: 16.5 + lcb_code_execution_pass@1: 33.82 + lcb_test_output_pass@1: 22.62 + bigcodebench_hard_instruct_pass@1: 6.08 + bigcodebench_hard_complete_pass@1: 6.76 + teval_naive_average: 79.73 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 70.2 + mmlu-stem_accuracy: 67.73 + mmlu-social-science_accuracy: 75.49 + mmlu-humanities_accuracy: 68.56 + mmlu-other_accuracy: 70.58 + cmmlu_accuracy: 74.24 + cmmlu-stem_accuracy: 66.7 + cmmlu-social-science_accuracy: 75.88 + cmmlu-humanities_accuracy: 77.56 + cmmlu-other_accuracy: 77.52 + cmmlu-china-specific_accuracy: 73.46 + mmlu_pro_accuracy: 45.39 + mmlu_pro_biology_accuracy: 65.83 + mmlu_pro_business_accuracy: 51.96 + mmlu_pro_chemistry_accuracy: 36.84 + mmlu_pro_computer_science_accuracy: 48.29 + mmlu_pro_economics_accuracy: 56.16 + mmlu_pro_engineering_accuracy: 29.1 + mmlu_pro_health_accuracy: 44.5 + mmlu_pro_history_accuracy: 42.26 + mmlu_pro_law_accuracy: 24.98 + mmlu_pro_math_accuracy: 54.85 + mmlu_pro_philosophy_accuracy: 39.28 + mmlu_pro_physics_accuracy: 37.41 + mmlu_pro_psychology_accuracy: 58.27 + mmlu_pro_other_accuracy: 45.78 + humanevalx-python_pass@1: 56.1 + humanevalx-cpp_pass@1: 20.73 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 59.15 + humanevalx-js_pass@1: 57.93 + ds1000_Pandas_accuracy: 8.93 + ds1000_Numpy_accuracy: 
4.09 + ds1000_Tensorflow_accuracy: 11.11 + ds1000_Scipy_accuracy: 7.55 + ds1000_Sklearn_accuracy: 7.83 + ds1000_Pytorch_accuracy: 8.82 + ds1000_Matplotlib_accuracy: 50.97 + mmmlu_lite_accuracy: 45.02 + openai_mmmlu_lite_AR-XY_accuracy: 18.6 + openai_mmmlu_lite_BN-BD_accuracy: 27.58 + openai_mmmlu_lite_DE-DE_accuracy: 51.23 + openai_mmmlu_lite_ES-LA_accuracy: 56.63 + openai_mmmlu_lite_FR-FR_accuracy: 58.11 + openai_mmmlu_lite_HI-IN_accuracy: 33.82 + openai_mmmlu_lite_ID-ID_accuracy: 50.39 + openai_mmmlu_lite_IT-IT_accuracy: 50.39 + openai_mmmlu_lite_JA-JP_accuracy: 50.95 + openai_mmmlu_lite_KO-KR_accuracy: 45.05 + openai_mmmlu_lite_PT-BR_accuracy: 57.89 + openai_mmmlu_lite_SW-KE_accuracy: 32.14 + openai_mmmlu_lite_YO-NG_accuracy: 32.14 + openai_mmmlu_lite_ZH-CN_accuracy: 65.33 + college_naive_average: 21 + high_naive_average: 47 + middle_naive_average: 59.67 + primary_naive_average: 76 + arithmetic_naive_average: 62 + mathbench-a (average)_naive_average: 53.13 + college_knowledge_naive_average: 68.99 + high_knowledge_naive_average: 70.06 + middle_knowledge_naive_average: 78.53 + primary_knowledge_naive_average: 88.49 + mathbench-t (average)_naive_average: 76.51 + + +qwen2.5-7b-instruct-pytorch: + objective: + race-high_accuracy: 85.16 + ARC-c_accuracy: 90.85 + BoolQ_accuracy: 86.61 + triviaqa_wiki_1shot_score: 52.96 + nq_open_1shot_score: 17.62 + mmmlu_lite_naive_average: 54.7 + IFEval_Prompt-level-strict-accuracy: 71.35 + drop_accuracy: 80.23 + bbh_naive_average: 68.88 + GPQA_diamond_accuracy: 36.36 + hellaswag_accuracy: 85.49 + TheoremQA_score: 18.38 + musr_average_naive_average: 43.3 + korbench_single_naive_average: 39.44 + ARC_Prize_Public_Evaluation_accuracy: 0 + gsm8k_accuracy: 91.66 + GaokaoBench_weighted_average: 80.02 + math_accuracy: 73.74 + cmo_fib_accuracy: 26.44 + aime2024_accuracy: 10 + Mathbench_naive_average: 77.08 + wikibench-wiki-single_choice_cncircular_perf_4: 34 + cmmlu_naive_average: 75.9 + mmlu_naive_average: 76.27 + mmlu_pro_naive_average: 
56.14 + openai_humaneval_humaneval_pass@1: 84.76 + sanitized_mbpp_score: 74.71 + humanevalx_naive_average: 48.17 + ds1000_naive_average: 18.57 + lcb_code_generation_pass@1: 38.75 + lcb_code_execution_pass@1: 42.38 + lcb_test_output_pass@1: 50.45 + bigcodebench_hard_instruct_pass@1: 16.89 + bigcodebench_hard_complete_pass@1: 12.16 + teval_naive_average: 79.46 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.27 + mmlu-stem_accuracy: 77.75 + mmlu-social-science_accuracy: 78.65 + mmlu-humanities_accuracy: 73.12 + mmlu-other_accuracy: 75.05 + cmmlu_accuracy: 75.9 + cmmlu-stem_accuracy: 73.41 + cmmlu-social-science_accuracy: 75.97 + cmmlu-humanities_accuracy: 76.42 + cmmlu-other_accuracy: 78.15 + cmmlu-china-specific_accuracy: 73.27 + mmlu_pro_accuracy: 56.14 + mmlu_pro_biology_accuracy: 72.25 + mmlu_pro_business_accuracy: 66.16 + mmlu_pro_chemistry_accuracy: 55.65 + mmlu_pro_computer_science_accuracy: 60.24 + mmlu_pro_economics_accuracy: 66.82 + mmlu_pro_engineering_accuracy: 41.38 + mmlu_pro_health_accuracy: 54.89 + mmlu_pro_history_accuracy: 46.46 + mmlu_pro_law_accuracy: 29.06 + mmlu_pro_math_accuracy: 73.58 + mmlu_pro_philosophy_accuracy: 44.89 + mmlu_pro_physics_accuracy: 60.05 + mmlu_pro_psychology_accuracy: 61.9 + mmlu_pro_other_accuracy: 52.6 + humanevalx-python_pass@1: 51.83 + humanevalx-cpp_pass@1: 42.68 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 73.78 + humanevalx-js_pass@1: 72.56 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 8.64 + ds1000_Tensorflow_accuracy: 17.78 + ds1000_Scipy_accuracy: 15.09 + ds1000_Sklearn_accuracy: 8.7 + ds1000_Pytorch_accuracy: 4.41 + ds1000_Matplotlib_accuracy: 61.29 + mmmlu_lite_accuracy: 54.7 + openai_mmmlu_lite_AR-XY_accuracy: 42.32 + openai_mmmlu_lite_BN-BD_accuracy: 42.18 + openai_mmmlu_lite_DE-DE_accuracy: 60 + openai_mmmlu_lite_ES-LA_accuracy: 66.18 + openai_mmmlu_lite_FR-FR_accuracy: 66.88 + openai_mmmlu_lite_HI-IN_accuracy: 48.63 + openai_mmmlu_lite_ID-ID_accuracy: 61.26 + 
openai_mmmlu_lite_IT-IT_accuracy: 65.26 + openai_mmmlu_lite_JA-JP_accuracy: 60.7 + openai_mmmlu_lite_KO-KR_accuracy: 60.63 + openai_mmmlu_lite_PT-BR_accuracy: 54.46 + openai_mmmlu_lite_SW-KE_accuracy: 36 + openai_mmmlu_lite_YO-NG_accuracy: 31.86 + openai_mmmlu_lite_ZH-CN_accuracy: 69.4 + college_naive_average: 48.33 + high_naive_average: 59.33 + middle_naive_average: 76.67 + primary_naive_average: 86.67 + arithmetic_naive_average: 74.33 + mathbench-a (average)_naive_average: 69.07 + college_knowledge_naive_average: 83.54 + high_knowledge_naive_average: 80.82 + middle_knowledge_naive_average: 83.79 + primary_knowledge_naive_average: 92.22 + mathbench-t (average)_naive_average: 85.1 + + +internlm3-8b-instruct-turbomind: + objective: + race-high_accuracy: 89.22 + ARC-c_accuracy: 92.54 + BoolQ_accuracy: 86.45 + triviaqa_wiki_1shot_score: 60.72 + nq_open_1shot_score: 20.25 + mmmlu_lite_naive_average: 41.82 + IFEval_Prompt-level-strict-accuracy: 77.45 + drop_accuracy: 83.27 + bbh_naive_average: 55.22 + GPQA_diamond_accuracy: 37.88 + hellaswag_accuracy: 91.28 + TheoremQA_score: 20.12 + musr_average_naive_average: 36.86 + korbench_single_naive_average: 41.2 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + gsm8k_accuracy: 91.28 + GaokaoBench_weighted_average: 86.59 + math_accuracy: 76.96 + cmo_fib_accuracy: 35.1 + aime2024_accuracy: 16.67 + Mathbench_naive_average: 78.96 + wikibench-wiki-single_choice_cncircular_perf_4: 37.45 + cmmlu_naive_average: 83.33 + mmlu_naive_average: 76.21 + mmlu_pro_naive_average: 57.96 + openai_humaneval_humaneval_pass@1: 81.71 + sanitized_mbpp_score: 69.65 + humanevalx_naive_average: 40.73 + ds1000_naive_average: 27.23 + lcb_code_generation_pass@1: 34.75 + lcb_code_execution_pass@1: 49.9 + lcb_test_output_pass@1: 48.19 + bigcodebench_hard_instruct_pass@1: 13.51 + bigcodebench_hard_complete_pass@1: 15.54 + teval_naive_average: 82.86 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.21 + mmlu-stem_accuracy: 77.7 + 
mmlu-social-science_accuracy: 80.98 + mmlu-humanities_accuracy: 70.83 + mmlu-other_accuracy: 75.01 + cmmlu_accuracy: 83.33 + cmmlu-stem_accuracy: 79.66 + cmmlu-social-science_accuracy: 83.39 + cmmlu-humanities_accuracy: 84.73 + cmmlu-other_accuracy: 86.2 + cmmlu-china-specific_accuracy: 81.77 + mmlu_pro_accuracy: 57.96 + mmlu_pro_biology_accuracy: 75.45 + mmlu_pro_business_accuracy: 64.64 + mmlu_pro_chemistry_accuracy: 59.81 + mmlu_pro_computer_science_accuracy: 60.24 + mmlu_pro_economics_accuracy: 68.6 + mmlu_pro_engineering_accuracy: 44.79 + mmlu_pro_health_accuracy: 58.31 + mmlu_pro_history_accuracy: 49.87 + mmlu_pro_law_accuracy: 32.43 + mmlu_pro_math_accuracy: 70.17 + mmlu_pro_philosophy_accuracy: 46.89 + mmlu_pro_physics_accuracy: 59.58 + mmlu_pro_psychology_accuracy: 66.29 + mmlu_pro_other_accuracy: 54.33 + humanevalx-python_pass@1: 43.9 + humanevalx-cpp_pass@1: 20.12 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 74.39 + humanevalx-js_pass@1: 65.24 + ds1000_Pandas_accuracy: 16.49 + ds1000_Numpy_accuracy: 34.09 + ds1000_Tensorflow_accuracy: 26.67 + ds1000_Scipy_accuracy: 17.92 + ds1000_Sklearn_accuracy: 20.87 + ds1000_Pytorch_accuracy: 19.12 + ds1000_Matplotlib_accuracy: 55.48 + mmmlu_lite_accuracy: 41.82 + openai_mmmlu_lite_AR-XY_accuracy: 32.56 + openai_mmmlu_lite_BN-BD_accuracy: 4.56 + openai_mmmlu_lite_DE-DE_accuracy: 24.91 + openai_mmmlu_lite_ES-LA_accuracy: 51.09 + openai_mmmlu_lite_FR-FR_accuracy: 61.68 + openai_mmmlu_lite_HI-IN_accuracy: 24.98 + openai_mmmlu_lite_ID-ID_accuracy: 44.56 + openai_mmmlu_lite_IT-IT_accuracy: 52.35 + openai_mmmlu_lite_JA-JP_accuracy: 51.02 + openai_mmmlu_lite_KO-KR_accuracy: 47.93 + openai_mmmlu_lite_PT-BR_accuracy: 53.89 + openai_mmmlu_lite_SW-KE_accuracy: 33.47 + openai_mmmlu_lite_YO-NG_accuracy: 33.47 + openai_mmmlu_lite_ZH-CN_accuracy: 69.05 + college_naive_average: 45.67 + high_naive_average: 64.67 + middle_naive_average: 82.33 + primary_naive_average: 90.33 + arithmetic_naive_average: 74 + mathbench-a 
(average)_naive_average: 71.4 + college_knowledge_naive_average: 85.28 + high_knowledge_naive_average: 79.43 + middle_knowledge_naive_average: 87.9 + primary_knowledge_naive_average: 93.42 + mathbench-t (average)_naive_average: 86.51 + + +internlm3-8b-instruct-pytorch: + objective: + race-high_accuracy: 89.02 + ARC-c_accuracy: 93.56 + BoolQ_accuracy: 86.67 + triviaqa_wiki_1shot_score: 60.54 + nq_open_1shot_score: 20.3 + mmmlu_lite_naive_average: 42.6 + IFEval_Prompt-level-strict-accuracy: 79.11 + drop_accuracy: 83.32 + bbh_naive_average: 54.76 + GPQA_diamond_accuracy: 42.42 + hellaswag_accuracy: 91.31 + TheoremQA_score: 18 + musr_average_naive_average: 36.62 + korbench_single_naive_average: 41.84 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + gsm8k_accuracy: 90.67 + GaokaoBench_weighted_average: 86.27 + math_accuracy: 76.68 + cmo_fib_accuracy: 33.65 + aime2024_accuracy: 10 + Mathbench_naive_average: 78.92 + wikibench-wiki-single_choice_cncircular_perf_4: 37.35 + cmmlu_naive_average: 83.11 + mmlu_naive_average: 76.23 + mmlu_pro_naive_average: 58.16 + openai_humaneval_humaneval_pass@1: 82.32 + sanitized_mbpp_score: 70.04 + humanevalx_naive_average: 39.76 + ds1000_naive_average: 27.84 + lcb_code_generation_pass@1: 34.5 + lcb_code_execution_pass@1: 48.02 + lcb_test_output_pass@1: 47.74 + bigcodebench_hard_instruct_pass@1: 12.84 + bigcodebench_hard_complete_pass@1: 15.54 + teval_naive_average: 82.86 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.23 + mmlu-stem_accuracy: 78.08 + mmlu-social-science_accuracy: 80.31 + mmlu-humanities_accuracy: 71.38 + mmlu-other_accuracy: 74.63 + cmmlu_accuracy: 83.11 + cmmlu-stem_accuracy: 79.42 + cmmlu-social-science_accuracy: 83.34 + cmmlu-humanities_accuracy: 83.95 + cmmlu-other_accuracy: 86.22 + cmmlu-china-specific_accuracy: 81.5 + mmlu_pro_accuracy: 58.16 + mmlu_pro_biology_accuracy: 74.62 + mmlu_pro_business_accuracy: 65.02 + mmlu_pro_chemistry_accuracy: 60.69 + mmlu_pro_computer_science_accuracy: 61.46 + 
mmlu_pro_economics_accuracy: 68.25 + mmlu_pro_engineering_accuracy: 45.3 + mmlu_pro_health_accuracy: 60.15 + mmlu_pro_history_accuracy: 50.66 + mmlu_pro_law_accuracy: 31.7 + mmlu_pro_math_accuracy: 70.32 + mmlu_pro_philosophy_accuracy: 47.7 + mmlu_pro_physics_accuracy: 59.51 + mmlu_pro_psychology_accuracy: 65.41 + mmlu_pro_other_accuracy: 53.46 + humanevalx-python_pass@1: 42.68 + humanevalx-cpp_pass@1: 19.51 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 72.56 + humanevalx-js_pass@1: 64.02 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 35 + ds1000_Tensorflow_accuracy: 24.44 + ds1000_Scipy_accuracy: 20.75 + ds1000_Sklearn_accuracy: 21.74 + ds1000_Pytorch_accuracy: 22.06 + ds1000_Matplotlib_accuracy: 56.77 + mmmlu_lite_accuracy: 42.6 + openai_mmmlu_lite_AR-XY_accuracy: 32.84 + openai_mmmlu_lite_BN-BD_accuracy: 10.46 + openai_mmmlu_lite_DE-DE_accuracy: 24.56 + openai_mmmlu_lite_ES-LA_accuracy: 50.95 + openai_mmmlu_lite_FR-FR_accuracy: 61.05 + openai_mmmlu_lite_HI-IN_accuracy: 30.6 + openai_mmmlu_lite_ID-ID_accuracy: 45.89 + openai_mmmlu_lite_IT-IT_accuracy: 51.79 + openai_mmmlu_lite_JA-JP_accuracy: 51.65 + openai_mmmlu_lite_KO-KR_accuracy: 48.77 + openai_mmmlu_lite_PT-BR_accuracy: 52.7 + openai_mmmlu_lite_SW-KE_accuracy: 32.91 + openai_mmmlu_lite_YO-NG_accuracy: 32.84 + openai_mmmlu_lite_ZH-CN_accuracy: 69.33 + college_naive_average: 47 + high_naive_average: 66.67 + middle_naive_average: 81.67 + primary_naive_average: 89.33 + arithmetic_naive_average: 73.67 + mathbench-a (average)_naive_average: 71.67 + college_knowledge_naive_average: 82.91 + high_knowledge_naive_average: 79.86 + middle_knowledge_naive_average: 88.92 + primary_knowledge_naive_average: 92.96 + mathbench-t (average)_naive_average: 86.16 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 45f74131..16a13209 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml 
@@ -1,21 +1,24 @@ chat: glm-4-9b-chat-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 90.62 + gsm8k_accuracy: 56.25 + race-high_accuracy: 84.38 glm-4-9b-chat-turbomind: gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 glm-4-9b-chat-vllm: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 68.75 race-high_accuracy: 90.62 deepseek-7b-chat-hf: gsm8k_accuracy: 46.88 race-high_accuracy: 81.25 - deepseek-moe-16b-chat-hf: - gsm8k_accuracy: 50 - race-high_accuracy: 68.75 + deepseek-r1-distill-llama-8b-turbomind: + gsm8k_accuracy: 31.25 + race-high_accuracy: 81.25 + deepseek-r1-distill-qwen-1_5b-turbomind: + gsm8k_accuracy: 37.5 + race-high_accuracy: 53.12 deepseek-7b-chat-vllm: - gsm8k_accuracy: 50 + gsm8k_accuracy: 43.75 race-high_accuracy: 78.12 gemma2-2b-it-hf: gsm8k_accuracy: 50 @@ -36,34 +39,40 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 gemma-7b-it-vllm: - gsm8k_accuracy: 46.88 + gsm8k_accuracy: 31.25 race-high_accuracy: 68.75 internlm2_5-7b-chat-hf: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 + internlm3-8b-instruct-hf: + gsm8k_accuracy: 65.62 + race-high_accuracy: 87.5 internlm2_5-7b-chat-turbomind: - gsm8k_accuracy: 87.50 + gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 internlm2-chat-1.8b-turbomind: gsm8k_accuracy: 28.12 race-high_accuracy: 84.38 internlm2-chat-1.8b-sft-turbomind: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 31.25 race-high_accuracy: 84.38 internlm2-chat-7b-lmdeploy: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 59.38 race-high_accuracy: 84.38 internlm2-chat-7b-sft-turbomind: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 56.25 race-high_accuracy: 90.62 + internlm3-8b-instruct-turbomind: + gsm8k_accuracy: 68.75 + race-high_accuracy: 87.5 internlm2-chat-7b-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 84.38 + gsm8k_accuracy: 59.38 + race-high_accuracy: 87.50 llama-3_1-8b-instruct-hf: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 llama-3_2-3b-instruct-hf: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 71.88 race-high_accuracy: 81.25 
llama-3-8b-instruct-hf: gsm8k_accuracy: 68.75 @@ -72,14 +81,14 @@ chat: gsm8k_accuracy: 18.75 race-high_accuracy: 46.88 llama-3_1-8b-instruct-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 81.25 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 65.62 + gsm8k_accuracy: 75.00 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: - gsm8k_accuracy: 71.88 - race-high_accuracy: 87.5 + gsm8k_accuracy: 68.75 + race-high_accuracy: 84.38 mistral-7b-instruct-v0.2-hf: gsm8k_accuracy: 40.62 race-high_accuracy: 75 @@ -94,13 +103,10 @@ chat: race-high_accuracy: 78.12 mistral-7b-instruct-v0.1-vllm: gsm8k_accuracy: 34.38 - race-high_accuracy: 68.75 + race-high_accuracy: 65.62 mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 31.25 - race-high_accuracy: 75 - phi-3-mini-4k-instruct-hf: - gsm8k_accuracy: 81.25 - race-high_accuracy: 87.50 + gsm8k_accuracy: 21.88 + race-high_accuracy: 78.12 qwen2.5-0.5b-instruct-hf: gsm8k_accuracy: 34.38 race-high_accuracy: 46.88 @@ -108,10 +114,10 @@ chat: gsm8k_accuracy: 53.12 race-high_accuracy: 90.62 qwen2.5-0.5b-instruct-turbomind: - gsm8k_accuracy: 28.12 - race-high_accuracy: 50 + gsm8k_accuracy: 31.25 + race-high_accuracy: 43.75 qwen2.5-3b-instruct-turbomind: - gsm8k_accuracy: 59.38 + gsm8k_accuracy: 56.25 race-high_accuracy: 90.62 qwen1.5-0.5b-chat-hf: gsm8k_accuracy: 0 @@ -123,11 +129,11 @@ chat: gsm8k_accuracy: 68.75 race-high_accuracy: 90.62 qwen2-1.5b-instruct-turbomind: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 56.25 race-high_accuracy: 84.38 qwen2-7b-instruct-turbomind: gsm8k_accuracy: 81.25 - race-high_accuracy: 90.62 + race-high_accuracy: 87.50 qwen1.5-0.5b-chat-vllm: gsm8k_accuracy: 3.12 race-high_accuracy: 53.12 @@ -143,11 +149,11 @@ chat: yi-1.5-9b-chat-turbomind: gsm8k_accuracy: 71.88 race-high_accuracy: 93.75 - deepseek-v2-lite-chat-hf: - gsm8k_accuracy: 46.88 + deepseek-v2_lite-chat-turbomind: + gsm8k_accuracy: 37.5 race-high_accuracy: 71.88 gemma2-27b-it-hf: - gsm8k_accuracy: 75 + gsm8k_accuracy: 
71.88 race-high_accuracy: 93.75 internlm2_5-20b-chat-hf: gsm8k_accuracy: 84.38 @@ -161,6 +167,9 @@ chat: mistral-small-instruct-2409-turbomind: gsm8k_accuracy: 81.25 race-high_accuracy: 87.50 + phi-4: + gsm8k_accuracy: 81.25 + race-high_accuracy: 87.50 qwen2.5-14b-instruct-hf: gsm8k_accuracy: 71.88 race-high_accuracy: 96.88 @@ -168,40 +177,41 @@ chat: gsm8k_accuracy: 68.75 race-high_accuracy: 93.75 yi-1.5-34b-chat-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 75.00 race-high_accuracy: 93.75 - deepseek-67b-chat-hf: - gsm8k_accuracy: 71.88 + deepseek-67b-chat-turbomind: + gsm8k_accuracy: 75.00 race-high_accuracy: 78.12 + deepseek-r1-distill-qwen-32b-turbomind: + gsm8k_accuracy: 25 + race-high_accuracy: 90.62 llama-3_3-70b-instruct-turbomind: gsm8k_accuracy: 93.75 race-high_accuracy: 87.5 - mixtral-8x7b-instruct-v0.1-hf: - gsm8k_accuracy: 59.38 - race-high_accuracy: 81.25 mixtral-large-instruct-2411-turbomind: - gsm8k_accuracy: 90.62 + gsm8k_accuracy: 87.50 race-high_accuracy: 93.75 nvidia-3_1-Nemotron-70b-instruct-HF-turbomind: - gsm8k_accuracy: 87.5 - race-high_accuracy: 46.88 + gsm8k_accuracy: 93.75 + race-high_accuracy: 50.00 qwen2.5-72b-instruct-turbomind: - gsm8k_accuracy: 75 - race-high_accuracy: 93.75 + gsm8k_accuracy: 81.25 + race-high_accuracy: 90.62 + deepseek-r1-distill-llama-70b-turbomind: + gsm8k_accuracy: 40.62 + race-high_accuracy: 90.62 deepseek-v2_5-1210-turbomind: gsm8k_accuracy: 90.62 race-high_accuracy: 84.38 - mixtral-8x22b-instruct-v0.1-hf: - gsm8k_accuracy: 81.25 - race-high_accuracy: 81.25 + mixtral-8x22b-instruct-v0.1-turbomind: + gsm8k_accuracy: 75 + race-high_accuracy: 78.12 + mixtral-8x22b-instruct-v0.1-vllm: + gsm8k_accuracy: 78.12 + race-high_accuracy: 78.12 base: - glm-4-9b-hf: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 glm-4-9b-turbomind: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 56.25 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 
84.38 @@ -210,15 +220,10 @@ base: GPQA_diamond_accuracy: 0 race-high_accuracy: 46.88 winogrande_accuracy: 71.88 - deepseek-moe-16b-base-hf: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 21.88 - winogrande_accuracy: 65.62 deepseek-7b-base-turbomind: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 18.75 GPQA_diamond_accuracy: 0 - race-high_accuracy: 46.88 + race-high_accuracy: 43.75 winogrande_accuracy: 84.38 deepseek-moe-16b-base-vllm: gsm8k_accuracy: 21.88 @@ -245,16 +250,21 @@ base: GPQA_diamond_accuracy: 3.12 race-high_accuracy: 65.62 winogrande_accuracy: 71.88 + gemma-2-9b-turbomind: + gsm8k_accuracy: 68.75 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 78.12 + winogrande_accuracy: 50 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 - race-high_accuracy: - winogrande_accuracy: + race-high_accuracy: 28.12 + winogrande_accuracy: 68.75 gemma-7b-vllm: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: - winogrande_accuracy: + gsm8k_accuracy: 43.75 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 81.25 + winogrande_accuracy: 81.25 internlm2_5-7b-hf: gsm8k_accuracy: 37.5 GPQA_diamond_accuracy: 25 @@ -265,30 +275,25 @@ base: GPQA_diamond_accuracy: 18.75 race-high_accuracy: 62.5 winogrande_accuracy: 78.12 - internlm2-base-7b-hf: - gsm8k_accuracy: 3.12 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 75 - winogrande_accuracy: 65.62 internlm2-1.8b-turbomind: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 6.25 + GPQA_diamond_accuracy: 12.5 race-high_accuracy: 71.88 - winogrande_accuracy: 78.12 + winogrande_accuracy: 75 internlm2_5-7b-turbomind: - gsm8k_accuracy: 62.50 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 34.38 race-high_accuracy: 93.75 - winogrande_accuracy: 87.50 + winogrande_accuracy: 84.38 internlm2-7b-turbomind: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 21.88 + gsm8k_accuracy: 50 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 71.88 winogrande_accuracy: 
84.38 internlm2-base-7b-turbomind: gsm8k_accuracy: 37.50 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 81.25 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 84.38 winogrande_accuracy: 75 llama-2-7b-hf: gsm8k_accuracy: 21.88 @@ -311,7 +316,7 @@ base: race-high_accuracy: 78.12 winogrande_accuracy: 78.12 llama-3-8b-turbomind: - gsm8k_accuracy: 50 + gsm8k_accuracy: 46.88 GPQA_diamond_accuracy: 12.50 race-high_accuracy: 65.62 winogrande_accuracy: 78.12 @@ -327,14 +332,14 @@ base: winogrande_accuracy: 71.88 qwen2.5-1.5b-turbomind: gsm8k_accuracy: 62.50 - GPQA_diamond_accuracy: 12.50 - race-high_accuracy: 78.12 - winogrande_accuracy: 68.75 - qwen2.5-7b-turbomind: - gsm8k_accuracy: 75.00 - GPQA_diamond_accuracy: 25 - race-high_accuracy: 87.5 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 75 winogrande_accuracy: 71.88 + qwen2.5-7b-turbomind: + gsm8k_accuracy: 71.88 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 87.5 + winogrande_accuracy: 75.00 qwen1.5-moe-a2.7b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 18.75 @@ -356,17 +361,17 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 68.75 qwen2-1.5b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 81.25 winogrande_accuracy: 75 qwen2-7b-turbomind: - gsm8k_accuracy: 75.00 + gsm8k_accuracy: 65.62 GPQA_diamond_accuracy: 12.5 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 qwen1.5-0.5b-vllm: - gsm8k_accuracy: 9.38 + gsm8k_accuracy: 6.25 GPQA_diamond_accuracy: 0 race-high_accuracy: 56.25 winogrande_accuracy: 62.5 @@ -382,27 +387,12 @@ base: winogrande_accuracy: 59.38 yi-1.5-9b-turbomind: gsm8k_accuracy: 78.12 - GPQA_diamond_accuracy: 40.62 + GPQA_diamond_accuracy: 43.75 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 - deepseek-v2-lite-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 59.38 - winogrande_accuracy: 71.88 - internlm2-20b-hf: - gsm8k_accuracy: 56.25 - 
GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 68.75 - winogrande_accuracy: 75 - internlm2-base-20b-hf: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 84.38 - winogrande_accuracy: 65.62 internlm2-20b-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 75 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 68.75 winogrande_accuracy: 81.25 qwen2.5-14b-hf: @@ -416,37 +406,27 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 78.12 qwen2.5-32b-turbomind: - gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 28.12 + gsm8k_accuracy: 87.5 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 - deepseek-67b-base-hf: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 81.25 - winogrande_accuracy: 90.62 deepseek-67b-base-turbomind: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 53.12 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 81.25 winogrande_accuracy: 84.38 llama-3-70b-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 qwen2.5-72b-turbomind: gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 34.38 + GPQA_diamond_accuracy: 31.25 race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 65.62 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 - llama-3-70b-hf: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 3.12 race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 + winogrande_accuracy: 81.25 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index a5a930fa..6a1c2ebc 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -61,6 +61,7 @@ env: HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub HF_HUB_CACHE: 
/fs-computility/llm/shared/llmeval/models/opencompass_hf_hub CONDA_ENV: regression_test + VLLM_WORKER_MULTIPROC_METHOD: spawn jobs: build-pypi: @@ -92,7 +93,6 @@ jobs: matrix: pyver: [py310] runs-on: ubuntu-latest - environment: 'prod' env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 @@ -126,7 +126,6 @@ jobs: if: ${{!cancelled()}} needs: ['build-pypi', 'build-pypi-lmdeploy'] runs-on: volc_cu12 - environment: 'prod' timeout-minutes: 120 #2hours steps: - name: Clone repository @@ -190,7 +189,6 @@ jobs: matrix: regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} runs-on: volc_cu12_daily - environment: 'prod' timeout-minutes: 180 #3hours steps: - name: Clone repository @@ -231,7 +229,6 @@ jobs: matrix: regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}} runs-on: volc_cu12_local - environment: 'prod' timeout-minutes: 480 #6hours steps: - name: Clone repository @@ -258,27 +255,33 @@ jobs: conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details + opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details + opencompass
--models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily + python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run model test - api if: matrix.regression_func == 'api' run: | . 
${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} conda info --envs - lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & + lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" sleep 180s + env | grep PROXY || true + env | grep proxy || true + unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py @@ -307,7 +310,6 @@ jobs: matrix: function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}} runs-on: volc_cu12 - environment: 'prod' timeout-minutes: 480 #6hours steps: - name: Clone repository @@ -341,7 +343,6 @@ jobs: needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] timeout-minutes: 5 runs-on: self-hosted - environment: 'prod' steps: - name: notify run: | diff --git a/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py b/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py new file mode 100644 index 00000000..1ffef256 --- /dev/null +++ b/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py @@ -0,0 +1,22 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mixtral-8x22b-instruct-v0.1-turbomind', + path='mistralai/Mixtral-8x22B-Instruct-v0.1', + engine_config=dict( +
session_len=32768, + max_batch_size=16, + tp=8, + cache_max_entry_count=0.7, + ), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=8), + ) +] diff --git a/opencompass/summarizers/subjective/common_summarizer.py b/opencompass/summarizers/subjective/common_summarizer.py index ccb8d139..de917f44 100644 --- a/opencompass/summarizers/subjective/common_summarizer.py +++ b/opencompass/summarizers/subjective/common_summarizer.py @@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer): f.write(','.join(new_header) + '\n') for line in new_table: f.write(','.join(map(str, line)) + '\n') - print(t) print(output_file) return {'qa_bench_' + show_dataset_abbr:json_result}