internlm2_5-7b-chat-hf_fullbench: objective: race-high_accuracy: 93.75 ARC-c_accuracy: 93.75 BoolQ_accuracy: 81.25 triviaqa_wiki_1shot_score: 50 nq_open_1shot_score: 25 IFEval_Prompt-level-strict-accuracy: 50 drop_accuracy: 81.25 GPQA_diamond_accuracy: 25 hellaswag_accuracy: 87.5 TheoremQA_score: 12.50 musr_average_naive_average: 39.58 korbench_single_naive_average: 40 gsm8k_accuracy: 62.50 math_accuracy: 75 cmo_fib_accuracy: 6.25 aime2024_accuracy: 6.25 wikibench-wiki-single_choice_cncircular_perf_4: 50 sanitized_mbpp_score: 68.75 ds1000_naive_average: 16.96 lcb_code_generation_pass@1: 12.5 lcb_code_execution_pass@1: 43.75 lcb_test_output_pass@1: 18.75 bbh-logical_deduction_seven_objects_score: 50 bbh-multistep_arithmetic_two_score: 68.75 mmlu-other_accuracy: 72.6 cmmlu-china-specific_accuracy: 76.25 mmlu_pro_math_accuracy: 25 ds1000_Pandas_accuracy: 12.5 ds1000_Numpy_accuracy: 0 ds1000_Tensorflow_accuracy: 12.5 ds1000_Scipy_accuracy: 18.75 ds1000_Sklearn_accuracy: 18.75 ds1000_Pytorch_accuracy: 12.5 ds1000_Matplotlib_accuracy: 43.75 openai_mmmlu_lite_AR-XY_accuracy: 37.5 college_naive_average: 12.5 college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.66 alpaca_eval_total: 20.00 arenahard_score: 56.82 Followbench_naive_average: 1 CompassArena_naive_average: 43 mtbench101_avg: 7.60 wildbench_average: -14.58 simpleqa_accuracy_given_attempted: 1.00 chinese_simpleqa_given_attempted_accuracy: 0.90 alignment_bench_v1_1_专业能力: 7.90 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 alignment_bench_v1_1_中文理解: 0 alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 alpaca_eval_helpful_base: 20.00 compassarena_language_naive_average: 35 compassarena_knowledge_naive_average: 60.00 compassarena_reason_v2_naive_average: 40 compassarena_math_v2_naive_average: 50.00 compassarena_creationv2_zh_naive_average: 30 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 followbench_llmeval_en_HSR_L2: 1 followbench_llmeval_en_HSR_L3: 1 followbench_llmeval_en_HSR_L4: 1 followbench_llmeval_en_HSR_L5: 1 followbench_llmeval_en_SSR_L1: 1 followbench_llmeval_en_SSR_L2: 1 followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 simpleqa_f1: 0.12 internlm2_5-7b-chat-turbomind_fullbench: objective: race-high_accuracy: 93.75 ARC-c_accuracy: 93.75 BoolQ_accuracy: 75.00 triviaqa_wiki_1shot_score: 50 nq_open_1shot_score: 25 IFEval_Prompt-level-strict-accuracy: 56.25 drop_accuracy: 75 GPQA_diamond_accuracy: 37.50 hellaswag_accuracy: 81.25 TheoremQA_score: 12.5 musr_average_naive_average: 39.58 korbench_single_naive_average: 40 gsm8k_accuracy: 68.75 math_accuracy: 68.75 cmo_fib_accuracy: 6.25 aime2024_accuracy: 6.25 wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 68.75 ds1000_naive_average: 15.18 lcb_code_generation_pass@1: 12.5 lcb_code_execution_pass@1: 43.75 lcb_test_output_pass@1: 0.00 bbh-logical_deduction_seven_objects_score: 62.50 bbh-multistep_arithmetic_two_score: 62.50 mmlu-other_accuracy: 73.08 cmmlu-china-specific_accuracy: 75.42 mmlu_pro_math_accuracy: 25.00 ds1000_Pandas_accuracy: 0.00 ds1000_Numpy_accuracy: 0 ds1000_Tensorflow_accuracy: 12.5 ds1000_Scipy_accuracy: 18.75 ds1000_Sklearn_accuracy: 18.75 ds1000_Pytorch_accuracy: 12.50 ds1000_Matplotlib_accuracy: 43.75 openai_mmmlu_lite_AR-XY_accuracy: 37.5 college_naive_average: 12.50 college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.72 alpaca_eval_total: 20.00 arenahard_score: 55.77 Followbench_naive_average: 1 CompassArena_naive_average: 39.00 mtbench101_avg: 7.90 wildbench_average: 0.00 simpleqa_accuracy_given_attempted: 1.00 chinese_simpleqa_given_attempted_accuracy: 1 alignment_bench_v1_1_专业能力: 8.70 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 alignment_bench_v1_1_中文理解: 0 alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 alpaca_eval_helpful_base: 20.00 compassarena_language_naive_average: 25.00 compassarena_knowledge_naive_average: 55.00 compassarena_reason_v2_naive_average: 35.00 compassarena_math_v2_naive_average: 55.00 compassarena_creationv2_zh_naive_average: 25.00 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 followbench_llmeval_en_HSR_L2: 1 followbench_llmeval_en_HSR_L3: 1 followbench_llmeval_en_HSR_L4: 1 followbench_llmeval_en_HSR_L5: 1 followbench_llmeval_en_SSR_L1: 1 followbench_llmeval_en_SSR_L2: 1 followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 simpleqa_f1: 0.12 internlm2_5-7b-hf_fullbench: objective: race-high_accuracy: 100 ARC-c_accuracy: 68.75 BoolQ_accuracy: 87.5 triviaqa_wiki_1shot_score: 43.75 nq_open_1shot_score: 43.75 drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 TheoremQA_score: 18.75 winogrande_accuracy: 75 gsm8k_accuracy: 37.5 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 math_accuracy: 12.5 wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 56.25 dingo_en_192_score: 37.5 dingo_zh_170_score: 100 mmlu-other_accuracy: 76.92 cmmlu-china-specific_accuracy: 84.17 mmlu_pro_math_accuracy: 18.75 bbh-logical_deduction_seven_objects_score: 43.75 bbh-multistep_arithmetic_two_score: 56.25 college_naive_average: 12.5 college_knowledge_naive_average: 87.5 internlm2_5-7b-turbomind_fullbench: objective: race-high_accuracy: 100 ARC-c_accuracy: 68.75 BoolQ_accuracy: 87.5 triviaqa_wiki_1shot_score: 43.75 nq_open_1shot_score: 43.75 drop_accuracy: 62.5 GPQA_diamond_accuracy: 68.75 hellaswag_accuracy: 93.75 TheoremQA_score: 18.75 winogrande_accuracy: 87.5 gsm8k_accuracy: 62.50 GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 math_accuracy: 6.25 wikibench-wiki-single_choice_cncircular_perf_4: 0.00 sanitized_mbpp_score: 62.50 dingo_en_192_score: 37.50 dingo_zh_170_score: 100.00 mmlu-other_accuracy: 78.37 cmmlu-china-specific_accuracy: 83.33 mmlu_pro_math_accuracy: 18.75 bbh-logical_deduction_seven_objects_score: 62.50 bbh-multistep_arithmetic_two_score: 50.00 college_naive_average: 12.5 college_knowledge_naive_average: 87.5 internlm2_5-7b-turbomind: objective: race-high_accuracy: 89.28 ARC-c_accuracy: 52.2 BoolQ_accuracy: 89.72 triviaqa_wiki_1shot_score: 65.88 nq_open_1shot_score: 34.82 drop_accuracy: 68.1 bbh_naive_average: 72.15 GPQA_diamond_accuracy: 32.83 hellaswag_accuracy: 88.36 TheoremQA_score: 25 winogrande_accuracy: 81.29 gsm8k_accuracy: 74.68 GaokaoBench_weighted_average: 58.19 math_accuracy: 33.98 Mathbench_naive_average: 48.38 wikibench-wiki-single_choice_cncircular_perf_4: 29.1 cmmlu_naive_average: 78.94 mmlu_naive_average: 71.44 mmlu_pro_naive_average: 38.18 openai_humaneval_humaneval_pass@1: 59.76 openai_humaneval_v2_humaneval_pass@1: 57.93 sanitized_mbpp_score: 55.25 dingo_en_192_score: 60.94 dingo_zh_170_score: 67.65 mmlu-stem_accuracy: 63.72 mmlu-social-science_accuracy: 80.15 mmlu-humanities_accuracy: 74.27 mmlu-other_accuracy: 71.85 cmmlu-stem_accuracy: 67.07 cmmlu-social-science_accuracy: 81.49 cmmlu-humanities_accuracy: 85.84 cmmlu-other_accuracy: 82.69 cmmlu-china-specific_accuracy: 79.88 mmlu_pro_biology_accuracy: 58.58 mmlu_pro_business_accuracy: 28.01 mmlu_pro_chemistry_accuracy: 22.79 mmlu_pro_computer_science_accuracy: 39.02 mmlu_pro_economics_accuracy: 53.08 mmlu_pro_engineering_accuracy: 25.7 mmlu_pro_health_accuracy: 46.94 mmlu_pro_history_accuracy: 43.04 mmlu_pro_law_accuracy: 29.7 mmlu_pro_math_accuracy: 24.2 mmlu_pro_philosophy_accuracy: 42.48 mmlu_pro_physics_accuracy: 26.02 mmlu_pro_psychology_accuracy: 52.76 mmlu_pro_other_accuracy: 42.21 college_naive_average: 7.00 high_naive_average: 6.67 middle_naive_average: 26.67 primary_naive_average: 64.00 arithmetic_naive_average: 55 mathbench-a (average)_naive_average: 31.8 college_knowledge_naive_average: 58.23 high_knowledge_naive_average: 52.51 middle_knowledge_naive_average: 71.15 primary_knowledge_naive_average: 60.48 mathbench-t (average)_naive_average: 60.19 long_context: Single-Needle-Retrieval(S-RT)-32000_naive_average: 100 Single-Needle-Retrieval-EN-32000_naive_average: 100 Single-Needle-Retrieval-ZH-32000_naive_average: 100 Single-Needle-Retrieval(S-RT)-100000_naive_average: 100 Single-Needle-Retrieval-EN-100000_naive_average: 100 Single-Needle-Retrieval-ZH-100000_naive_average: 100 Single-Needle-Retrieval(S-RT)-200000_naive_average: 100 Single-Needle-Retrieval-EN-200000_naive_average: 100 Single-Needle-Retrieval-ZH-200000_naive_average: 100 longbench_naive_average: 46.19 longbench_zh_naive_average: 49.3 longbench_en_naive_average: 43.97 longbench_single-document-qa_score: 42.84 longbench_multi-document-qa_score: 41.25 longbench_summarization_score: 23.21 longbench_few-shot-learning_score: 61.67 longbench_synthetic-tasks_score: 60.05 longbench_code-completion_score: 52.09 internlm2_5-7b-chat-turbomind: objective: race-high_accuracy: 86.16 ARC-c_accuracy: 90.17 BoolQ_accuracy: 87.89 triviaqa_wiki_1shot_score: 64.91 nq_open_1shot_score: 22.69 mmmlu_lite_naive_average: 44.96 IFEval_Prompt-level-strict-accuracy: 58.04 drop_accuracy: 77.68 bbh_naive_average: 73.14 GPQA_diamond_accuracy: 31.06 hellaswag_accuracy: 94.79 TheoremQA_score: 22.25 musr_average_naive_average: 50.89 korbench_single_naive_average: 32.16 ARC_Prize_Public_Evaluation_accuracy: 0.02 gsm8k_accuracy: 86.73 GaokaoBench_weighted_average: 78.6 math_accuracy: 61 cmo_fib_accuracy: 11 aime2024_accuracy: 3.33 Mathbench_naive_average: 64.23 wikibench-wiki-single_choice_cncircular_perf_4: 31.32 cmmlu_naive_average: 74.3 mmlu_naive_average: 70.84 mmlu_pro_naive_average: 44.98 openai_humaneval_humaneval_pass@1: 69.8 sanitized_mbpp_score: 64.4 humanevalx_naive_average: 33.35 ds1000_naive_average: 14.15 lcb_code_generation_pass@1: 17.75 lcb_code_execution_pass@1: 32.57 lcb_test_output_pass@1: 26.13 bigcodebench_hard_instruct_pass@1: 3.38 bigcodebench_hard_complete_pass@1: 5.06 teval_naive_average: 80 SciCode_sub_accuracy: 5.56 qa_dingo_cn_score: 99.01 mmlu-stem_accuracy: 68.2 mmlu-social-science_accuracy: 75.8 mmlu-humanities_accuracy: 69.3 mmlu-other_accuracy: 71.3 cmmlu-stem_accuracy: 66.64 cmmlu-social-science_accuracy: 76 cmmlu-humanities_accuracy: 77.9 cmmlu-other_accuracy: 77.25 cmmlu-china-specific_accuracy: 73.6 mmlu_pro_biology_accuracy: 66.67 mmlu_pro_business_accuracy: 47.91 mmlu_pro_chemistry_accuracy: 35 mmlu_pro_computer_science_accuracy: 48.9 mmlu_pro_economics_accuracy: 55.87 mmlu_pro_engineering_accuracy: 29.62 mmlu_pro_health_accuracy: 45 mmlu_pro_history_accuracy: 40.8 mmlu_pro_law_accuracy: 25.79 mmlu_pro_math_accuracy: 53.48 mmlu_pro_philosophy_accuracy: 38.38 mmlu_pro_physics_accuracy: 37.79 mmlu_pro_psychology_accuracy: 58.39 mmlu_pro_other_accuracy: 46.27 humanevalx-python_pass@1: 53.66 humanevalx-cpp_pass@1: 22.56 humanevalx-go_pass@1: 0 humanevalx-js_pass@1: 54.88 ds1000_Pandas_accuracy: 10.65 ds1000_Numpy_accuracy: 3.63 ds1000_Tensorflow_accuracy: 13.33 ds1000_Scipy_accuracy: 8.96 ds1000_Sklearn_accuracy: 6.96 ds1000_Pytorch_accuracy: 6.62 ds1000_Matplotlib_accuracy: 49.35 openai_mmmlu_lite_AR-XY_accuracy: 17.19 openai_mmmlu_lite_BN-BD_accuracy: 26.78 openai_mmmlu_lite_DE-DE_accuracy: 51.27 openai_mmmlu_lite_ES-LA_accuracy: 56.94 openai_mmmlu_lite_FR-FR_accuracy: 58.22 openai_mmmlu_lite_HI-IN_accuracy: 30.75 openai_mmmlu_lite_ID-ID_accuracy: 50.6 openai_mmmlu_lite_IT-IT_accuracy: 50.6 openai_mmmlu_lite_JA-JP_accuracy: 51.13 openai_mmmlu_lite_KO-KR_accuracy: 45 openai_mmmlu_lite_PT-BR_accuracy: 57.68 openai_mmmlu_lite_SW-KE_accuracy: 32.56 openai_mmmlu_lite_YO-NG_accuracy: 32.42 openai_mmmlu_lite_ZH-CN_accuracy: 65.4 college_naive_average: 19.17 high_naive_average: 46.5 middle_naive_average: 61.34 primary_naive_average: 73.34 arithmetic_naive_average: 61.67 mathbench-a (average)_naive_average: 52.58 college_knowledge_naive_average: 67.1 high_knowledge_naive_average: 70 middle_knowledge_naive_average: 80 primary_knowledge_naive_average: 90.12 mathbench-t (average)_naive_average: 76 subjective: alignment_bench_v1_1_总分: 5.68 alpaca_eval_total: 25.96 arenahard_score: 17.15 Followbench_naive_average: 0.81 CompassArena_naive_average: 39.49 FoFo_naive_average: 0.38 mtbench101_avg: 8.01 wildbench_average: -10.49 simpleqa_accuracy_given_attempted: 0.04 chinese_simpleqa_given_attempted_accuracy: 0.34 alignment_bench_v1_1_专业能力: 6.05 alignment_bench_v1_1_数学计算: 5.87 alignment_bench_v1_1_基本任务: 6.01 alignment_bench_v1_1_逻辑推理: 4.48 alignment_bench_v1_1_中文理解: 6.17 alignment_bench_v1_1_文本写作: 6.06 alignment_bench_v1_1_角色扮演: 6.3 alignment_bench_v1_1_综合问答: 6.45 alpaca_eval_helpful_base: 17.83 alpaca_eval_koala: 28.21 alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 alpaca_eval_vicuna: 25.00 compassarena_language_naive_average: 53.00 compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 compassarena_math_v2_naive_average: 16.07 compassarena_creationv2_zh_naive_average: 43.64 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 followbench_llmeval_en_HSR_AVG: 0.73 followbench_llmeval_en_SSR_AVG: 0.88 followbench_llmeval_en_HSR_L1: 0.94 followbench_llmeval_en_HSR_L2: 0.77 followbench_llmeval_en_HSR_L3: 0.73 followbench_llmeval_en_HSR_L4: 0.68 followbench_llmeval_en_HSR_L5: 0.54 followbench_llmeval_en_SSR_L1: 0.94 followbench_llmeval_en_SSR_L2: 0.88 followbench_llmeval_en_SSR_L3: 0.87 followbench_llmeval_en_SSR_L4: 0.87 followbench_llmeval_en_SSR_L5: 0.85 simpleqa_f1: 0.04 internlm2_5-7b-chat-1m-turbomind: long_context: ruler_8k_naive_average: 88.53 ruler_32k_naive_average: 83.84 ruler_128k_naive_average: 70.94 NeedleBench-Overall-Score-8K_weighted_average: 91.89 NeedleBench-Overall-Score-32K_weighted_average: 91.42 NeedleBench-Overall-Score-128K_weighted_average: 88.57 longbench_naive_average: 46.44 longbench_zh_naive_average: 45.19 longbench_en_naive_average: 45.71 babilong_0k_naive_average: 79.3 babilong_4k_naive_average: 67 babilong_16k_naive_average: 52.7 babilong_32k_naive_average: 48.9 babilong_128k_naive_average: 40.8 babilong_256k_naive_average: 23.5 longbench_single-document-qa_score: 43.56 longbench_multi-document-qa_score: 46.24 longbench_summarization_score: 24.32 longbench_few-shot-learning_score: 51.67 longbench_synthetic-tasks_score: 66.83 longbench_code-completion_score: 45.99 qwen2.5-7b-instruct-turbomind: objective: race-high_accuracy: 84.99 ARC-c_accuracy: 92.2 BoolQ_accuracy: 86.7 triviaqa_wiki_1shot_score: 53.06 nq_open_1shot_score: 17.51 mmmlu_lite_naive_average: 54.96 IFEval_Prompt-level-strict-accuracy: 71.53 drop_accuracy: 80.07 bbh_naive_average: 68.81 GPQA_diamond_accuracy: 34.34 hellaswag_accuracy: 85.42 TheoremQA_score: 18.38 musr_average_naive_average: 43.44 korbench_single_naive_average: 39.44 ARC_Prize_Public_Evaluation_accuracy: 0 gsm8k_accuracy: 92.57 GaokaoBench_weighted_average: 80.14 math_accuracy: 73.58 cmo_fib_accuracy: 25 aime2024_accuracy: 16.67 Mathbench_naive_average: 77.33 wikibench-wiki-single_choice_cncircular_perf_4: 34.9 cmmlu_naive_average: 75.97 mmlu_naive_average: 76.01 mmlu_pro_naive_average: 56.12 openai_humaneval_humaneval_pass@1: 83.54 sanitized_mbpp_score: 74.71 humanevalx_naive_average: 48.29 ds1000_naive_average: 18.66 lcb_code_generation_pass@1: 39.5 lcb_code_execution_pass@1: 42.38 lcb_test_output_pass@1: 50.68 bigcodebench_hard_instruct_pass@1: 16.22 bigcodebench_hard_complete_pass@1: 11.49 teval_naive_average: 79.72 SciCode_sub_accuracy: 10.76 qa_dingo_cn_score: 99.01 mmlu_accuracy: 76.01 mmlu-stem_accuracy: 77.59 mmlu-social-science_accuracy: 79.02 mmlu-humanities_accuracy: 72.07 mmlu-other_accuracy: 74.86 cmmlu_accuracy: 75.97 cmmlu-stem_accuracy: 73.09 cmmlu-social-science_accuracy: 75.95 cmmlu-humanities_accuracy: 76.53 cmmlu-other_accuracy: 78.79 cmmlu-china-specific_accuracy: 73.17 mmlu_pro_accuracy: 56.12 mmlu_pro_biology_accuracy: 71.41 mmlu_pro_business_accuracy: 67.68 mmlu_pro_chemistry_accuracy: 54.59 mmlu_pro_computer_science_accuracy: 58.29 mmlu_pro_economics_accuracy: 66.82 mmlu_pro_engineering_accuracy: 42.41 mmlu_pro_health_accuracy: 55.87 mmlu_pro_history_accuracy: 46.46 mmlu_pro_law_accuracy: 28.97 mmlu_pro_math_accuracy: 73.13 mmlu_pro_philosophy_accuracy: 44.89 mmlu_pro_physics_accuracy: 58.43 mmlu_pro_psychology_accuracy: 63.16 mmlu_pro_other_accuracy: 53.57 humanevalx-python_pass@1: 50 humanevalx-cpp_pass@1: 42.07 humanevalx-go_pass@1: 0 humanevalx-java_pass@1: 53.05 humanevalx-js_pass@1: 75 ds1000_Pandas_accuracy: 14.09 ds1000_Numpy_accuracy: 8.18 ds1000_Tensorflow_accuracy: 17.78 ds1000_Scipy_accuracy: 15.09 ds1000_Sklearn_accuracy: 10.43 ds1000_Pytorch_accuracy: 4.41 ds1000_Matplotlib_accuracy: 60.65 mmmlu_lite_accuracy: 54.96 openai_mmmlu_lite_AR-XY_accuracy: 42.32 openai_mmmlu_lite_BN-BD_accuracy: 42.25 openai_mmmlu_lite_DE-DE_accuracy: 59.93 openai_mmmlu_lite_ES-LA_accuracy: 66.53 openai_mmmlu_lite_FR-FR_accuracy: 66.88 openai_mmmlu_lite_HI-IN_accuracy: 49.26 openai_mmmlu_lite_ID-ID_accuracy: 61.26 openai_mmmlu_lite_IT-IT_accuracy: 65.47 openai_mmmlu_lite_JA-JP_accuracy: 61.54 openai_mmmlu_lite_KO-KR_accuracy: 60.28 openai_mmmlu_lite_PT-BR_accuracy: 55.51 openai_mmmlu_lite_SW-KE_accuracy: 36.42 openai_mmmlu_lite_YO-NG_accuracy: 32.14 openai_mmmlu_lite_ZH-CN_accuracy: 69.61 college_naive_average: 44.33 high_naive_average: 59 middle_naive_average: 78 primary_naive_average: 85.67 arithmetic_naive_average: 75.67 mathbench-a (average)_naive_average: 69.27 college_knowledge_naive_average: 83.86 high_knowledge_naive_average: 80.29 middle_knowledge_naive_average: 84.26 primary_knowledge_naive_average: 93.16 mathbench-t (average)_naive_average: 85.39 internlm2_5-7b-chat-pytorch: objective: race-high_accuracy: 86.39 ARC-c_accuracy: 90.51 BoolQ_accuracy: 88.01 triviaqa_wiki_1shot_score: 64.77 nq_open_1shot_score: 22.71 mmmlu_lite_naive_average: 45.02 IFEval_Prompt-level-strict-accuracy: 56.56 drop_accuracy: 75.46 bbh_naive_average: 73.34 GPQA_diamond_accuracy: 32.83 hellaswag_accuracy: 94.81 TheoremQA_score: 23.88 musr_average_naive_average: 51.31 korbench_single_naive_average: 32 ARC_Prize_Public_Evaluation_accuracy: 0.01 gsm8k_accuracy: 86.96 GaokaoBench_weighted_average: 78.05 math_accuracy: 60.34 cmo_fib_accuracy: 12.98 aime2024_accuracy: 3.33 Mathbench_naive_average: 64.82 wikibench-wiki-single_choice_cncircular_perf_4: 31.7 cmmlu_naive_average: 74.24 mmlu_naive_average: 70.2 mmlu_pro_naive_average: 45.39 openai_humaneval_humaneval_pass@1: 70.12 sanitized_mbpp_score: 64.59 humanevalx_naive_average: 38.78 ds1000_naive_average: 14.19 lcb_code_generation_pass@1: 16.5 lcb_code_execution_pass@1: 33.82 lcb_test_output_pass@1: 22.62 bigcodebench_hard_instruct_pass@1: 6.08 bigcodebench_hard_complete_pass@1: 6.76 teval_naive_average: 79.73 SciCode_sub_accuracy: 3.47 qa_dingo_cn_score: 100 mmlu_accuracy: 70.2 mmlu-stem_accuracy: 67.73 mmlu-social-science_accuracy: 75.49 mmlu-humanities_accuracy: 68.56 mmlu-other_accuracy: 70.58 cmmlu_accuracy: 74.24 cmmlu-stem_accuracy: 66.7 cmmlu-social-science_accuracy: 75.88 cmmlu-humanities_accuracy: 77.56 cmmlu-other_accuracy: 77.52 cmmlu-china-specific_accuracy: 73.46 mmlu_pro_accuracy: 45.39 mmlu_pro_biology_accuracy: 65.83 mmlu_pro_business_accuracy: 51.96 mmlu_pro_chemistry_accuracy: 36.84 mmlu_pro_computer_science_accuracy: 48.29 mmlu_pro_economics_accuracy: 56.16 mmlu_pro_engineering_accuracy: 29.1 mmlu_pro_health_accuracy: 44.5 mmlu_pro_history_accuracy: 42.26 mmlu_pro_law_accuracy: 24.98 mmlu_pro_math_accuracy: 54.85 mmlu_pro_philosophy_accuracy: 39.28 mmlu_pro_physics_accuracy: 37.41 mmlu_pro_psychology_accuracy: 58.27 mmlu_pro_other_accuracy: 45.78 humanevalx-python_pass@1: 56.1 humanevalx-cpp_pass@1: 20.73 humanevalx-go_pass@1: 0 humanevalx-java_pass@1: 59.15 humanevalx-js_pass@1: 57.93 ds1000_Pandas_accuracy: 8.93 ds1000_Numpy_accuracy: 4.09 ds1000_Tensorflow_accuracy: 11.11 ds1000_Scipy_accuracy: 7.55 ds1000_Sklearn_accuracy: 7.83 ds1000_Pytorch_accuracy: 8.82 ds1000_Matplotlib_accuracy: 50.97 mmmlu_lite_accuracy: 45.02 openai_mmmlu_lite_AR-XY_accuracy: 18.6 openai_mmmlu_lite_BN-BD_accuracy: 27.58 openai_mmmlu_lite_DE-DE_accuracy: 51.23 openai_mmmlu_lite_ES-LA_accuracy: 56.63 openai_mmmlu_lite_FR-FR_accuracy: 58.11 openai_mmmlu_lite_HI-IN_accuracy: 33.82 openai_mmmlu_lite_ID-ID_accuracy: 50.39 openai_mmmlu_lite_IT-IT_accuracy: 50.39 openai_mmmlu_lite_JA-JP_accuracy: 50.95 openai_mmmlu_lite_KO-KR_accuracy: 45.05 openai_mmmlu_lite_PT-BR_accuracy: 57.89 openai_mmmlu_lite_SW-KE_accuracy: 32.14 openai_mmmlu_lite_YO-NG_accuracy: 32.14 openai_mmmlu_lite_ZH-CN_accuracy: 65.33 college_naive_average: 21 high_naive_average: 47 middle_naive_average: 59.67 primary_naive_average: 72.33 arithmetic_naive_average: 62 mathbench-a (average)_naive_average: 53.13 college_knowledge_naive_average: 68.99 high_knowledge_naive_average: 70.06 middle_knowledge_naive_average: 78.53 primary_knowledge_naive_average: 88.49 mathbench-t (average)_naive_average: 76.51 qwen2.5-7b-instruct-pytorch: objective: race-high_accuracy: 85.16 ARC-c_accuracy: 90.85 BoolQ_accuracy: 86.61 triviaqa_wiki_1shot_score: 52.96 nq_open_1shot_score: 17.62 mmmlu_lite_naive_average: 54.7 IFEval_Prompt-level-strict-accuracy: 71.35 drop_accuracy: 80.23 bbh_naive_average: 68.88 GPQA_diamond_accuracy: 36.36 hellaswag_accuracy: 85.49 TheoremQA_score: 18.38 musr_average_naive_average: 43.3 korbench_single_naive_average: 39.44 ARC_Prize_Public_Evaluation_accuracy: 0 gsm8k_accuracy: 91.66 GaokaoBench_weighted_average: 80.02 math_accuracy: 73.74 cmo_fib_accuracy: 22.60 aime2024_accuracy: 13.33 Mathbench_naive_average: 77.08 wikibench-wiki-single_choice_cncircular_perf_4: 34 cmmlu_naive_average: 75.9 mmlu_naive_average: 76.27 mmlu_pro_naive_average: 56.14 openai_humaneval_humaneval_pass@1: 84.76 sanitized_mbpp_score: 74.71 humanevalx_naive_average: 48.17 ds1000_naive_average: 18.57 lcb_code_generation_pass@1: 38.75 lcb_code_execution_pass@1: 42.38 lcb_test_output_pass@1: 50.45 bigcodebench_hard_instruct_pass@1: 16.89 bigcodebench_hard_complete_pass@1: 12.16 teval_naive_average: 79.46 SciCode_sub_accuracy: 10.42 qa_dingo_cn_score: 100 mmlu_accuracy: 76.27 mmlu-stem_accuracy: 77.75 mmlu-social-science_accuracy: 78.65 mmlu-humanities_accuracy: 73.12 mmlu-other_accuracy: 75.05 cmmlu_accuracy: 75.9 cmmlu-stem_accuracy: 73.41 cmmlu-social-science_accuracy: 75.97 cmmlu-humanities_accuracy: 76.42 cmmlu-other_accuracy: 78.15 cmmlu-china-specific_accuracy: 73.27 mmlu_pro_accuracy: 56.14 mmlu_pro_biology_accuracy: 72.25 mmlu_pro_business_accuracy: 66.16 mmlu_pro_chemistry_accuracy: 55.65 mmlu_pro_computer_science_accuracy: 60.24 mmlu_pro_economics_accuracy: 66.82 mmlu_pro_engineering_accuracy: 41.38 mmlu_pro_health_accuracy: 54.89 mmlu_pro_history_accuracy: 46.46 mmlu_pro_law_accuracy: 29.06 mmlu_pro_math_accuracy: 73.58 mmlu_pro_philosophy_accuracy: 44.89 mmlu_pro_physics_accuracy: 60.05 mmlu_pro_psychology_accuracy: 61.9 mmlu_pro_other_accuracy: 52.6 humanevalx-python_pass@1: 51.83 humanevalx-cpp_pass@1: 42.68 humanevalx-go_pass@1: 0 humanevalx-java_pass@1: 73.78 humanevalx-js_pass@1: 72.56 ds1000_Pandas_accuracy: 14.09 ds1000_Numpy_accuracy: 8.64 ds1000_Tensorflow_accuracy: 17.78 ds1000_Scipy_accuracy: 15.09 ds1000_Sklearn_accuracy: 8.7 ds1000_Pytorch_accuracy: 4.41 ds1000_Matplotlib_accuracy: 61.29 mmmlu_lite_accuracy: 54.7 openai_mmmlu_lite_AR-XY_accuracy: 42.32 openai_mmmlu_lite_BN-BD_accuracy: 42.18 openai_mmmlu_lite_DE-DE_accuracy: 60 openai_mmmlu_lite_ES-LA_accuracy: 66.18 openai_mmmlu_lite_FR-FR_accuracy: 66.88 openai_mmmlu_lite_HI-IN_accuracy: 48.63 openai_mmmlu_lite_ID-ID_accuracy: 61.26 openai_mmmlu_lite_IT-IT_accuracy: 65.26 openai_mmmlu_lite_JA-JP_accuracy: 60.7 openai_mmmlu_lite_KO-KR_accuracy: 60.63 openai_mmmlu_lite_PT-BR_accuracy: 54.46 openai_mmmlu_lite_SW-KE_accuracy: 36 openai_mmmlu_lite_YO-NG_accuracy: 31.86 openai_mmmlu_lite_ZH-CN_accuracy: 69.4 college_naive_average: 48.33 high_naive_average: 59.33 middle_naive_average: 76.67 primary_naive_average: 86.67 arithmetic_naive_average: 74.33 mathbench-a (average)_naive_average: 69.07 college_knowledge_naive_average: 83.54 high_knowledge_naive_average: 80.82 middle_knowledge_naive_average: 83.79 primary_knowledge_naive_average: 92.22 mathbench-t (average)_naive_average: 85.1 internlm3-8b-instruct-turbomind: objective: race-high_accuracy: 89.22 ARC-c_accuracy: 92.54 BoolQ_accuracy: 86.45 triviaqa_wiki_1shot_score: 60.72 nq_open_1shot_score: 20.25 mmmlu_lite_naive_average: 41.82 IFEval_Prompt-level-strict-accuracy: 77.45 drop_accuracy: 83.27 bbh_naive_average: 55.22 GPQA_diamond_accuracy: 37.88 hellaswag_accuracy: 91.28 TheoremQA_score: 20.12 musr_average_naive_average: 36.86 korbench_single_naive_average: 41.2 ARC_Prize_Public_Evaluation_accuracy: 0.06 gsm8k_accuracy: 91.28 GaokaoBench_weighted_average: 86.59 math_accuracy: 76.96 cmo_fib_accuracy: 38.46 aime2024_accuracy: 13.33 Mathbench_naive_average: 78.96 wikibench-wiki-single_choice_cncircular_perf_4: 37.45 cmmlu_naive_average: 83.33 mmlu_naive_average: 76.21 mmlu_pro_naive_average: 57.96 openai_humaneval_humaneval_pass@1: 81.71 sanitized_mbpp_score: 69.65 humanevalx_naive_average: 40.73 ds1000_naive_average: 27.23 lcb_code_generation_pass@1: 34.75 lcb_code_execution_pass@1: 49.9 lcb_test_output_pass@1: 48.19 bigcodebench_hard_instruct_pass@1: 13.51 bigcodebench_hard_complete_pass@1: 15.54 teval_naive_average: 82.86 SciCode_sub_accuracy: 11.11 qa_dingo_cn_score: 100 mmlu_accuracy: 76.21 mmlu-stem_accuracy: 77.7 mmlu-social-science_accuracy: 80.98 mmlu-humanities_accuracy: 70.83 mmlu-other_accuracy: 75.01 cmmlu_accuracy: 83.33 cmmlu-stem_accuracy: 79.66 cmmlu-social-science_accuracy: 83.39 cmmlu-humanities_accuracy: 84.73 cmmlu-other_accuracy: 86.2 cmmlu-china-specific_accuracy: 81.77 mmlu_pro_accuracy: 57.96 mmlu_pro_biology_accuracy: 75.45 mmlu_pro_business_accuracy: 64.64 mmlu_pro_chemistry_accuracy: 59.81 mmlu_pro_computer_science_accuracy: 60.24 mmlu_pro_economics_accuracy: 68.6 mmlu_pro_engineering_accuracy: 44.79 mmlu_pro_health_accuracy: 58.31 mmlu_pro_history_accuracy: 49.87 mmlu_pro_law_accuracy: 32.43 mmlu_pro_math_accuracy: 70.17 mmlu_pro_philosophy_accuracy: 46.89 mmlu_pro_physics_accuracy: 59.58 mmlu_pro_psychology_accuracy: 66.29 mmlu_pro_other_accuracy: 54.33 humanevalx-python_pass@1: 43.9 humanevalx-cpp_pass@1: 20.12 humanevalx-go_pass@1: 0 humanevalx-java_pass@1: 40.85 humanevalx-js_pass@1: 65.24 ds1000_Pandas_accuracy: 16.49 ds1000_Numpy_accuracy: 34.09 ds1000_Tensorflow_accuracy: 26.67 ds1000_Scipy_accuracy: 17.92 ds1000_Sklearn_accuracy: 20.87 ds1000_Pytorch_accuracy: 19.12 ds1000_Matplotlib_accuracy: 55.48 mmmlu_lite_accuracy: 41.82 openai_mmmlu_lite_AR-XY_accuracy: 32.56 openai_mmmlu_lite_BN-BD_accuracy: 4.56 openai_mmmlu_lite_DE-DE_accuracy: 24.91 openai_mmmlu_lite_ES-LA_accuracy: 51.09 openai_mmmlu_lite_FR-FR_accuracy: 61.68 openai_mmmlu_lite_HI-IN_accuracy: 24.98 openai_mmmlu_lite_ID-ID_accuracy: 44.56 openai_mmmlu_lite_IT-IT_accuracy: 52.35 openai_mmmlu_lite_JA-JP_accuracy: 51.02 openai_mmmlu_lite_KO-KR_accuracy: 47.93 openai_mmmlu_lite_PT-BR_accuracy: 53.89 openai_mmmlu_lite_SW-KE_accuracy: 33.47 openai_mmmlu_lite_YO-NG_accuracy: 33.47 openai_mmmlu_lite_ZH-CN_accuracy: 69.05 college_naive_average: 45.67 high_naive_average: 64.67 middle_naive_average: 82.33 primary_naive_average: 90.33 arithmetic_naive_average: 74 mathbench-a (average)_naive_average: 71.4 college_knowledge_naive_average: 85.28 high_knowledge_naive_average: 79.43 middle_knowledge_naive_average: 87.9 primary_knowledge_naive_average: 93.42 mathbench-t (average)_naive_average: 86.51 internlm3-8b-instruct-pytorch: objective: race-high_accuracy: 89.02 ARC-c_accuracy: 93.56 BoolQ_accuracy: 86.67 triviaqa_wiki_1shot_score: 60.54 nq_open_1shot_score: 20.3 mmmlu_lite_naive_average: 42.6 IFEval_Prompt-level-strict-accuracy: 79.11 drop_accuracy: 83.32 bbh_naive_average: 54.76 GPQA_diamond_accuracy: 33.84 hellaswag_accuracy: 91.31 TheoremQA_score: 18 musr_average_naive_average: 36.62 korbench_single_naive_average: 41.84 ARC_Prize_Public_Evaluation_accuracy: 0.06 gsm8k_accuracy: 90.67 GaokaoBench_weighted_average: 86.27 math_accuracy: 76.68 cmo_fib_accuracy: 33.65 aime2024_accuracy: 10 Mathbench_naive_average: 78.92 wikibench-wiki-single_choice_cncircular_perf_4: 37.35 cmmlu_naive_average: 83.11 mmlu_naive_average: 76.23 mmlu_pro_naive_average: 58.16 openai_humaneval_humaneval_pass@1: 82.32 sanitized_mbpp_score: 70.04 humanevalx_naive_average: 25.49 ds1000_naive_average: 27.84 lcb_code_generation_pass@1: 34.5 lcb_code_execution_pass@1: 48.02 lcb_test_output_pass@1: 47.74 bigcodebench_hard_instruct_pass@1: 12.84 bigcodebench_hard_complete_pass@1: 15.54 teval_naive_average: 82.86 SciCode_sub_accuracy: 9.38 qa_dingo_cn_score: 100 mmlu_accuracy: 76.23 mmlu-stem_accuracy: 78.08 mmlu-social-science_accuracy: 80.31 mmlu-humanities_accuracy: 71.38 mmlu-other_accuracy: 74.63 cmmlu_accuracy: 83.11 cmmlu-stem_accuracy: 79.42 cmmlu-social-science_accuracy: 83.34 cmmlu-humanities_accuracy: 83.95 cmmlu-other_accuracy: 86.22 cmmlu-china-specific_accuracy: 81.5 mmlu_pro_accuracy: 58.16 mmlu_pro_biology_accuracy: 74.62 mmlu_pro_business_accuracy: 65.02 mmlu_pro_chemistry_accuracy: 60.69 mmlu_pro_computer_science_accuracy: 61.46 mmlu_pro_economics_accuracy: 68.25 mmlu_pro_engineering_accuracy: 45.3 mmlu_pro_health_accuracy: 60.15 mmlu_pro_history_accuracy: 50.66 mmlu_pro_law_accuracy: 31.7 mmlu_pro_math_accuracy: 70.32 mmlu_pro_philosophy_accuracy: 47.7 mmlu_pro_physics_accuracy: 59.51 mmlu_pro_psychology_accuracy: 65.41 mmlu_pro_other_accuracy: 53.46 humanevalx-python_pass@1: 42.68 humanevalx-cpp_pass@1: 19.51 humanevalx-go_pass@1: 0 humanevalx-java_pass@1: 0.00 humanevalx-js_pass@1: 64.02 ds1000_Pandas_accuracy: 14.09 ds1000_Numpy_accuracy: 35 ds1000_Tensorflow_accuracy: 24.44 ds1000_Scipy_accuracy: 20.75 ds1000_Sklearn_accuracy: 21.74 ds1000_Pytorch_accuracy: 22.06 ds1000_Matplotlib_accuracy: 56.77 mmmlu_lite_accuracy: 42.6 openai_mmmlu_lite_AR-XY_accuracy: 32.84 openai_mmmlu_lite_BN-BD_accuracy: 10.46 openai_mmmlu_lite_DE-DE_accuracy: 24.56 openai_mmmlu_lite_ES-LA_accuracy: 50.95 openai_mmmlu_lite_FR-FR_accuracy: 61.05 openai_mmmlu_lite_HI-IN_accuracy: 30.6 openai_mmmlu_lite_ID-ID_accuracy: 45.89 openai_mmmlu_lite_IT-IT_accuracy: 51.79 openai_mmmlu_lite_JA-JP_accuracy: 51.65 openai_mmmlu_lite_KO-KR_accuracy: 48.77 openai_mmmlu_lite_PT-BR_accuracy: 52.7 openai_mmmlu_lite_SW-KE_accuracy: 32.91 openai_mmmlu_lite_YO-NG_accuracy: 32.84 openai_mmmlu_lite_ZH-CN_accuracy: 69.33 college_naive_average: 47 high_naive_average: 66.67 middle_naive_average: 81.67 primary_naive_average: 89.33 arithmetic_naive_average: 73.67 mathbench-a (average)_naive_average: 71.67 college_knowledge_naive_average: 82.91 high_knowledge_naive_average: 79.86 middle_knowledge_naive_average: 88.92 primary_knowledge_naive_average: 92.96 mathbench-t (average)_naive_average: 86.16