diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py index 11c2f514..23a8505b 100644 --- a/.github/scripts/eval_regression_base_fullbench.py +++ b/.github/scripts/eval_regression_base_fullbench.py @@ -99,61 +99,66 @@ GaokaoBench_datasets = [ ] datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) +summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []) +summary_groups.append( + { + 'name': 'Mathbench', + 'subsets': ['mathbench-a (average)', 'mathbench-t (average)'], + }, ) + summarizer = dict( dataset_abbrs=[ + 'Language', ['race-high', 'accuracy'], ['ARC-c', 'accuracy'], ['BoolQ', 'accuracy'], - ['mmlu_pro', 'naive_average'], - ['GPQA_diamond', 'accuracy'], - ['cmmlu', 'naive_average'], - ['mmlu', 'naive_average'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + '', + 'General Reasoning', ['drop', 'accuracy'], ['bbh', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + ['winogrande', 'accuracy'], + '', + 'Math Calculation', + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + 'GaokaoBench_2010-2022_Math_II_MCQs', + 'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank', ['math', 'accuracy'], + ['Mathbench', 'naive_average'], + '', + 'Knowledge', + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['mmlu_pro', 'naive_average'], + '', + 'Code', ['openai_humaneval', 'humaneval_pass@1'], ['openai_humaneval_v2', 'humaneval_pass@1'], ['sanitized_mbpp', 'score'], - ['wikibench-wiki-single_choice_cncircular', 'perf_4'], - ['gsm8k', 'accuracy'], - ['GaokaoBench', 'weighted_average'], - ['triviaqa_wiki_1shot', 'score'], - ['nq_open_1shot', 'score'], - ['winogrande', 'accuracy'], - ['hellaswag', 'accuracy'], - ['TheoremQA', 'score'], + '', ['dingo_en_192', 'score'], ['dingo_zh_170', 'score'], - '###### MathBench-A: Application Part ######', - 'college', - 'high', - 'middle', - 'primary', - 'arithmetic', - 'mathbench-a (average)', - '###### MathBench-T: Theory Part ######', - 'college_knowledge', - 'high_knowledge', - 'middle_knowledge', - 'primary_knowledge', - 'mathbench-t (average)', - '###### Overall: Average between MathBench-A and MathBench-T ######', - 'Overall', - '', - 'bbh-logical_deduction_seven_objects', - 'bbh-multistep_arithmetic_two', '', 'mmlu', 'mmlu-stem', 'mmlu-social-science', 'mmlu-humanities', ['mmlu-other', 'accuracy'], + '', 'cmmlu', 'cmmlu-stem', 'cmmlu-social-science', 'cmmlu-humanities', 'cmmlu-other', ['cmmlu-china-specific', 'accuracy'], + '', 'mmlu_pro', 'mmlu_pro_biology', 'mmlu_pro_business', @@ -169,9 +174,24 @@ summarizer = dict( 'mmlu_pro_physics', 'mmlu_pro_psychology', 'mmlu_pro_other', + '', + 'bbh-logical_deduction_seven_objects', + 'bbh-multistep_arithmetic_two', + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', ], - summary_groups=sum( - [v for k, v in locals().items() if k.endswith('_summary_groups')], []), + summary_groups=summary_groups, ) models = sum([v for k, v in locals().items() if k.endswith('_model')], []) diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py 
b/.github/scripts/eval_regression_chat_objective_fullbench.py index 368fe040..f02fb7c4 100644 --- a/.github/scripts/eval_regression_chat_objective_fullbench.py +++ b/.github/scripts/eval_regression_chat_objective_fullbench.py @@ -7,8 +7,14 @@ with read_base(): aime2024_datasets # noqa: F401, E501 from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \ ARC_c_datasets # noqa: F401, E501 + # removed because of OOM + # from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, E501 from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \ + bigcodebench_hard_complete_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \ + bigcodebench_hard_instruct_datasets # noqa: F401, E501 from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \ cmmlu_datasets # noqa: F401, E501 from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \ @@ -26,15 +32,17 @@ with read_base(): gsm8k_datasets # noqa: F401, E501 from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ hellaswag_datasets # noqa: F401, E501 - from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \ + from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \ humaneval_datasets # noqa: F401, E501 - from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \ + from opencompass.configs.datasets.humanevalx.humanevalx_gen_3d84a3 import \ humanevalx_datasets # noqa: F401, E501 - from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ + from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \ ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \ + korbench_0shot_single_datasets # noqa: F401, E501 from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \ LCB_datasets # noqa: F401, E501 - from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ + from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \ math_datasets # noqa: F401, E501 from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \ mathbench_datasets # noqa: F401, E501 @@ -81,6 +90,8 @@ with read_base(): GaokaoBench_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.humanevalx import \ humanevalx_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.korbench import \ + korbench_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ mathbench_2024_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.mmlu import \ @@ -185,6 +196,8 @@ summarizer = dict( ['hellaswag', 'accuracy'], ['TheoremQA', 'score'], ['musr_average', 'naive_average'], + ['korbench_single', 'naive_average'], +
['ARC_Prize_Public_Evaluation', 'accuracy'], '', 'Math Calculation', ['gsm8k', 'accuracy'], @@ -208,6 +221,8 @@ summarizer = dict( ['lcb_code_generation', 'pass@1'], ['lcb_code_execution', 'pass@1'], ['lcb_test_output', 'pass@1'], + ['bigcodebench_hard_instruct', 'pass@1'], + ['bigcodebench_hard_complete', 'pass@1'], '', 'Agent', ['teval', 'naive_average'], diff --git a/.github/scripts/eval_regression_chat_subjective_fullbench.py b/.github/scripts/eval_regression_chat_subjective_fullbench.py index 8a6ef8fd..60495f22 100644 --- a/.github/scripts/eval_regression_chat_subjective_fullbench.py +++ b/.github/scripts/eval_regression_chat_subjective_fullbench.py @@ -4,35 +4,37 @@ from mmengine.config import read_base from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.runners import LocalRunner -from opencompass.summarizers import SubjectiveSummarizer +from opencompass.summarizers import DefaultSubjectiveSummarizer from opencompass.tasks.subjective_eval import SubjectiveEvalTask with read_base(): # read hf models - chat models # Dataset - from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \ + from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \ + csimpleqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \ + simpleqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \ alignbench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ + from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \ alpacav2_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \ arenahard_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ + from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \ compassarena_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ + from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import \ fofo_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \ + from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \ followbench_llmeval_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ + from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \ mtbench101_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ + from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \ wildbench_datasets # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 -summarizer = dict(type=SubjectiveSummarizer, function='subjective') - datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and 'mtbench101' not in k
and 'wildbench' not in k), []) datasets += mtbench101_datasets # noqa: F401, E501 @@ -68,3 +70,128 @@ eval = dict( max_num_workers=16, task=dict(type=SubjectiveEvalTask)), ) + +summary_groups = [] +summary_groups.append({ + 'name': + 'compassarena_language', + 'subsets': [ + ['compassarena_language', '内容总结'], + ['compassarena_language', '情感分析'], + ['compassarena_language', 'Information Retrival'], + ['compassarena_language', '综合问答'], + ['compassarena_language', '中华文化'], + ], +}) +summary_groups.append({ + 'name': + 'compassarena_knowledge', + 'subsets': [ + ['compassarena_knowledge', '生活常识_ZH'], + ['compassarena_knowledge', '自然科学工科_ZH'], + ['compassarena_knowledge', '人文科学_ZH'], + ['compassarena_knowledge', '自然科学理科_ZH'], + ['compassarena_knowledge', '社会科学_ZH'], + ], +}) +summary_groups.append({ + 'name': 'compassarena_reason_v2', + 'subsets': [ + ['compassarena_reason_v2', 'reasoning'], + ], +}) +summary_groups.append({ + 'name': + 'compassarena_math_v2', + 'subsets': [ + ['compassarena_math_v2', '高等数学_ZH'], + ['compassarena_math_v2', '初等数学_ZH'], + ['compassarena_math_v2', '中等数学_ZH'], + ], +}) +summary_groups.append({ + 'name': + 'compassarena_creationv2_zh', + 'subsets': [ + ['compassarena_creationv2_zh', '内容扩写_ZH'], + ['compassarena_creationv2_zh', '内容续写_ZH'], + ['compassarena_creationv2_zh', '内容改写_ZH'], + ], +}) +summary_groups.append({ + 'name': + 'CompassArena', + 'subsets': [ + 'compassarena_language', + 'compassarena_knowledge', + 'compassarena_reason_v2', + 'compassarena_math_v2', + 'compassarena_creationv2_zh', + ], +}) +summary_groups.append({ + 'name': + 'FoFo', + 'subsets': [['fofo_test_prompts', 'overall'], + ['fofo_test_prompts_cn', 'overall']], +}) +summary_groups.append({ + 'name': + 'Followbench', + 'subsets': [ + ['followbench_llmeval_en', 'HSR_AVG'], + ['followbench_llmeval_en', 'SSR_AVG'], + ], +}) + +# Summarizer +summarizer = dict( + dataset_abbrs=[ + ['alignment_bench_v1_1', '总分'], + ['alpaca_eval', 'total'], + ['arenahard', 'score'], + ['Followbench', 'naive_average'], + ['CompassArena', 'naive_average'], + ['FoFo', 'naive_average'], + ['mtbench101', 'avg'], + ['wildbench', 'average'], + ['simpleqa', 'accuracy_given_attempted'], + ['chinese_simpleqa', 'given_attempted_accuracy'], + '', + ['alignment_bench_v1_1', '专业能力'], + ['alignment_bench_v1_1', '数学计算'], + ['alignment_bench_v1_1', '基本任务'], + ['alignment_bench_v1_1', '逻辑推理'], + ['alignment_bench_v1_1', '中文理解'], + ['alignment_bench_v1_1', '文本写作'], + ['alignment_bench_v1_1', '角色扮演'], + ['alignment_bench_v1_1', '综合问答'], + ['alpaca_eval', 'helpful_base'], + ['alpaca_eval', 'koala'], + ['alpaca_eval', 'oasst'], + ['alpaca_eval', 'selfinstruct'], + ['alpaca_eval', 'vicuna'], + ['compassarena_language', 'naive_average'], + ['compassarena_knowledge', 'naive_average'], + ['compassarena_reason_v2', 'naive_average'], + ['compassarena_math_v2', 'naive_average'], + ['compassarena_creationv2_zh', 'naive_average'], + ['fofo_test_prompts', 'overall'], + ['fofo_test_prompts_cn', 'overall'], + ['followbench_llmeval_en', 'HSR_AVG'], + ['followbench_llmeval_en', 'SSR_AVG'], + ['followbench_llmeval_en', 'HSR_L1'], + ['followbench_llmeval_en', 'HSR_L2'], + ['followbench_llmeval_en', 'HSR_L3'], + ['followbench_llmeval_en', 'HSR_L4'], + ['followbench_llmeval_en', 'HSR_L5'], + ['followbench_llmeval_en', 'SSR_L1'], + ['followbench_llmeval_en', 'SSR_L2'], + ['followbench_llmeval_en', 'SSR_L3'], + ['followbench_llmeval_en', 'SSR_L4'], + ['followbench_llmeval_en', 'SSR_L5'], + ['simpleqa', 'f1'], + ], + type=DefaultSubjectiveSummarizer, + 
summary_groups=summary_groups, +) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 179dec27..d53c5bf5 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -7,28 +7,55 @@ import yaml output_path = 'regression_result_daily' chat_model_list = [ - 'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind', - 'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf', - 'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind', - 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', - 'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', - 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy', - 'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm', - 'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf', - 'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind', - 'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', - 'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf', - 'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind', - 'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm', - 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', - 'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf', - 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf', - 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', - 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', - 'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf', - 'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf', - 'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf', + 'baichuan2-7b-chat-hf', + 'glm-4-9b-chat-hf', + 'glm-4-9b-chat-turbomind', + 'glm-4-9b-chat-vllm', + 'deepseek-7b-chat-hf', + 'deepseek-moe-16b-chat-hf', + 'deepseek-7b-chat-vllm', + 'gemma2-2b-it-hf', + 'gemma2-9b-it-hf', + 'gemma-2b-it-hf', + 'gemma-7b-it-hf', + 'gemma-2-9b-it-turbomind', + 'gemma-7b-it-vllm', + 'internlm2_5-7b-chat-hf', + 'internlm2_5-7b-chat-turbomind', + 'internlm2-chat-1.8b-turbomind', + 'internlm2-chat-1.8b-sft-turbomind', + 'internlm2-chat-7b-lmdeploy', + 'internlm2-chat-7b-sft-turbomind', + 'internlm2-chat-7b-vllm', + 'llama-3_1-8b-instruct-hf', + 'llama-3_2-3b-instruct-hf', + 'llama-3-8b-instruct-hf', + 'llama-3_1-8b-instruct-turbomind', + 'llama-3_2-3b-instruct-turbomind', + 'llama-3-8b-instruct-turbomind', + 'mistral-7b-instruct-v0.2-hf', + 'mistral-7b-instruct-v0.3-hf', + 'mistral-nemo-instruct-2407-hf', + 'mistral-nemo-instruct-2407-turbomind', + 'mistral-7b-instruct-v0.1-vllm', + 'mistral-7b-instruct-v0.2-vllm', + # 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', + # 'minicpm-2b-sft-fp32-hf', + 'phi-3-mini-4k-instruct-hf', + 'qwen1.5-0.5b-chat-hf', + 'qwen2-1.5b-instruct-hf', + 'qwen2-7b-instruct-hf', + 'qwen2-1.5b-instruct-turbomind', + 'qwen2-7b-instruct-turbomind', + 'qwen1.5-0.5b-chat-vllm', + 'yi-1.5-6b-chat-hf', + 'yi-1.5-9b-chat-hf', + 'deepseek-v2-lite-chat-hf', + 'internlm2_5-20b-chat-hf', + 'internlm2_5-20b-chat-turbomind', + 'mistral-small-instruct-2409-hf', + 'mistral-small-instruct-2409-turbomind', + 'qwen2.5-14b-instruct-hf', 'qwen2.5-14b-instruct-turbomind' ] base_model_list = [ @@ -92,9 +119,9 @@ def result_scores(): class TestChat: """Test cases for chat model.""" - @pytest.mark.parametrize('model, dataset', - [(p1, p2) for p1 in chat_model_list - for p2 in ['gsm8k', 
'race-high']]) + @pytest.mark.parametrize( + 'model, dataset', [(p1, p2) for p1 in chat_model_list + for p2 in ['gsm8k_accuracy', 'race-high_accuracy']]) def test_model_dataset_score(self, baseline_scores_testrange, result_scores, model, dataset): base_score = baseline_scores_testrange.get(model).get(dataset) @@ -108,13 +135,14 @@ class TestChat: class TestBase: """Test cases for base model.""" - @pytest.mark.parametrize( - 'model, dataset', - [(p1, p2) for p1 in base_model_list - for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']]) + @pytest.mark.parametrize('model, dataset', [ + (p1, p2) for p1 in base_model_list for p2 in + ['gsm8k_accuracy', 'GPQA_diamond_accuracy', 'race-high_accuracy', 'winogrande_accuracy'] + ]) def test_model_dataset_score(self, baseline_scores_testrange, result_scores, model, dataset): - if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k': + if model in ['gemma-2b-vllm', 'gemma-7b-vllm' + ] and dataset != 'gsm8k_accuracy': return base_score = baseline_scores_testrange.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -131,16 +159,23 @@ class TestChatObjFullbench: 'internlm2_5-7b-chat-hf_fullbench', 'internlm2_5-7b-chat-turbomind_fullbench' ] for p2 in [ - 'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot', - 'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA', - 'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024', - 'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000', - 'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output', - 'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two', - 'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas', - 'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn', - 'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY', - 'college', 'college_knowledge' + 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', + 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', + 'IFEval_Prompt-level-strict-accuracy', 'drop_accuracy', + 'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score', + 'musr_average_naive_average', 'korbench_single_naive_average', + 'gsm8k_accuracy', 'math_accuracy', 'cmo_fib_accuracy', + 'aime2024_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4', + 'sanitized_mbpp_score', 'ds1000_naive_average', + 'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1', + 'lcb_test_output_pass@1', 'bbh-logical_deduction_seven_objects_score', + 'bbh-multistep_arithmetic_two_score', 'mmlu-other_naive_average', + 'cmmlu-china-specific_naive_average', 'mmlu_pro_math_accuracy', + 'ds1000_Pandas_accuracy', 'ds1000_Numpy_accuracy', + 'ds1000_Tensorflow_accuracy', 'ds1000_Scipy_accuracy', + 'ds1000_Sklearn_accuracy', 'ds1000_Pytorch_accuracy', + 'ds1000_Matplotlib_accuracy', 'openai_mmmlu_lite_AR-XY_accuracy', + 'college_naive_average', 'college_knowledge_naive_average' ]]) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): @@ -159,17 +194,27 @@ class TestChatSubFullbench: 'internlm2_5-7b-chat-hf_fullbench', 'internlm2_5-7b-chat-turbomind_fullbench' ] for p2 in [ - 'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal', - 'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language', - 'CompassArenacompassarena_knowledge', - 'CompassArenacompassarena_reason_v2', - 'CompassArenacompassarena_math_v2', - 'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts', - 'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
- 'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4', - 'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2', - 'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5', - 'MTBench101average', 'Wildbenchscore' + 'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score', + 'Followbench_naive_average', 'CompassArena_naive_average', + 'mtbench101_avg', 'wildbench_average', + 'simpleqa_accuracy_given_attempted', + 'chinese_simpleqa_given_attempted_accuracy', + 'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算', + 'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理', + 'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作', + 'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答', + 'alpaca_eval_helpful_base', 'compassarena_language_naive_average', + 'compassarena_knowledge_naive_average', + 'compassarena_reason_v2_naive_average', + 'compassarena_math_v2_naive_average', + 'compassarena_creationv2_zh_naive_average', + 'fofo_test_prompts_overall', 'followbench_llmeval_en_HSR_AVG', + 'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1', + 'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3', + 'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5', + 'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2', + 'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4', + 'followbench_llmeval_en_SSR_L5', 'simpleqa_f1' ]]) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): @@ -187,13 +232,18 @@ class TestBaseFullbench: @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ 'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench' ] for p2 in [ - 'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math', - 'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k', - 'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag', - 'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college', - 'college_knowledge', 'bbh-logical_deduction_seven_objects', - 'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific', - 'mmlu_pro_math' + 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', + 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy', + 'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score', + 'winogrande_accuracy', 'gsm8k_accuracy', + 'GaokaoBench_2010-2022_Math_II_MCQs_score', + 'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score', + 'math_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4', + 'sanitized_mbpp_score', 'dingo_en_192_score', 'dingo_zh_170_score', + 'mmlu-other_accuracy', 'cmmlu-china-specific_accuracy', + 'mmlu_pro_math_accuracy', 'bbh-logical_deduction_seven_objects_score', + 'bbh-multistep_arithmetic_two_score', 'college_naive_average', + 'college_knowledge_naive_average' ]]) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): @@ -209,40 +259,238 @@ class TestApibench: """Test cases for chat model.""" @pytest.mark.parametrize('model, dataset', - [('lmdeploy-api-test', 'race-middle'), - ('lmdeploy-api-test', 'race-high'), - ('lmdeploy-api-test', 'gsm8k')]) + [('lmdeploy-api-test', 'race-middle_accuracy'), + ('lmdeploy-api-test', 'race-high_accuracy'), + ('lmdeploy-api-test', 'gsm8k_accuracy')]) def test_api(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(model + '_batch', result_score, base_score) 
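Every test in this script delegates the actual comparison to assert_score, a helper that ships alongside these scripts and is untouched by this patch; note the '_batch' suffix appended to the model name in the fullbench and API tests, which presumably selects a looser tolerance for batched inference. A minimal sketch of such a helper, with tolerance values that are purely illustrative assumptions rather than the repository's actual thresholds:

    def assert_score(model_type, score, baseline):
        # Illustrative sketch only: the real assert_score lives with these
        # scripts and its tolerances may differ.
        assert score is not None and score != '-', 'score missing from summary CSV'
        score = float(score)
        # Assumed: batched inference is noisier, so '_batch' widens the band.
        rel = 0.10 if model_type.endswith('_batch') else 0.02
        band = max(abs(baseline) * rel, 1.0)  # absolute floor for tiny baselines
        assert baseline - band <= score <= baseline + band, (
            f'{model_type}: got {score}, expected {baseline} +/- {band}')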
+@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_fullbench') +@pytest.mark.volc_fullbench +class TestVolcFullbench: + """Test cases for chat model.""" + + @pytest.mark.parametrize('model, dataset', [( + p1, p2 + ) for p1 in ['internlm2_5-7b-chat-turbomind'] for p2 in [ + 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', + 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', + 'mmmlu_lite_naive_average', 'IFEval_Prompt-level-strict-accuracy', + 'drop_accuracy', 'bbh_naive_average', 'GPQA_diamond_accuracy', + 'hellaswag_accuracy', 'TheoremQA_score', 'musr_average_naive_average', + 'korbench_single_naive_average', + 'ARC_Prize_Public_Evaluation_accuracy', 'gsm8k_accuracy', + 'GaokaoBench_weighted_average', 'math_accuracy', 'cmo_fib_accuracy', + 'aime2024_accuracy', 'Mathbench_naive_average', + 'wikibench-wiki-single_choice_cncircular_perf_4', + 'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average', + 'openai_humaneval_humaneval_pass@1', 'sanitized_mbpp_score', + 'humanevalx_naive_average', 'ds1000_naive_average', + 'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1', + 'lcb_test_output_pass@1', 'bigcodebench_hard_instruct_pass@1', + 'bigcodebench_hard_complete_pass@1', 'teval_naive_average', + 'qa_dingo_cn_score', 'mmlu-stem_naive_average', + 'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average', + 'mmlu-other_naive_average', 'cmmlu-stem_naive_average', + 'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average', + 'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average', + 'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy', + 'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy', + 'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy', + 'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy', + 'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy', + 'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy', + 'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy', + 'humanevalx-python_pass@1', 'humanevalx-cpp_pass@1', + 'humanevalx-go_pass@1', 'humanevalx-java_pass@1', + 'humanevalx-js_pass@1', 'ds1000_Pandas_accuracy', + 'ds1000_Numpy_accuracy', 'ds1000_Tensorflow_accuracy', + 'ds1000_Scipy_accuracy', 'ds1000_Sklearn_accuracy', + 'ds1000_Pytorch_accuracy', 'ds1000_Matplotlib_accuracy', + 'openai_mmmlu_lite_AR-XY_accuracy', 'openai_mmmlu_lite_BN-BD_accuracy', + 'openai_mmmlu_lite_DE-DE_accuracy', 'openai_mmmlu_lite_ES-LA_accuracy', + 'openai_mmmlu_lite_FR-FR_accuracy', 'openai_mmmlu_lite_HI-IN_accuracy', + 'openai_mmmlu_lite_ID-ID_accuracy', 'openai_mmmlu_lite_IT-IT_accuracy', + 'openai_mmmlu_lite_JA-JP_accuracy', 'openai_mmmlu_lite_KO-KR_accuracy', + 'openai_mmmlu_lite_PT-BR_accuracy', 'openai_mmmlu_lite_SW-KE_accuracy', + 'openai_mmmlu_lite_YO-NG_accuracy', 'openai_mmmlu_lite_ZH-CN_accuracy', + 'college_naive_average', 'high_naive_average', 'middle_naive_average', + 'primary_naive_average', 'arithmetic_naive_average', + 'mathbench-a (average)_naive_average', + 'college_knowledge_naive_average', 'high_knowledge_naive_average', + 'middle_knowledge_naive_average', 'primary_knowledge_naive_average', + 'mathbench-t (average)_naive_average' + ]]) + @pytest.mark.chat_objective + def test_chat_objective(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score) + + 
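All dataset identifiers in these parametrized lists follow the '<dataset>_<metric>' convention produced by the reworked read_csv_file further down in this patch, and each method carries its own marker (chat_objective, chat_subjective, base_objective, base_long_context, chat_long_context) on top of the class-level volc_fullbench marker, so a CI job can select one slice at a time, e.g. `pytest -m "volc_fullbench and chat_subjective"` (assuming the custom markers are registered in the project's pytest settings). A condensed sketch of that key-flattening step, mirroring the patch's logic with the column names taken from read_csv_file below:

    import csv

    def load_flattened_scores(csv_path):
        # Condensed mirror of this patch's read_csv_file: key each row as
        # '<dataset>_<metric>' and fan the scores out per model column.
        scores = {}
        with open(csv_path, 'r') as f:
            for row in csv.DictReader(f):
                metric = row.pop('metric', None)
                if metric is None or 'bpb' in metric or metric == '_':
                    continue
                key = row.pop('dataset') + '_' + metric
                row.pop('version', None)
                row.pop('mode', None)
                for model, value in row.items():
                    scores.setdefault(model, {})[key] = value
        return scores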
@pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind'] + for p2 in [ + 'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score', + 'Followbench_naive_average', 'CompassArena_naive_average', + 'FoFo_naive_average', 'mtbench101_avg', 'wildbench_average', + 'simpleqa_accuracy_given_attempted', + 'chinese_simpleqa_given_attempted_accuracy', + 'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算', + 'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理', + 'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作', + 'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答', + 'alpaca_eval_helpful_base', 'alpaca_eval_koala', + 'alpaca_eval_oasst', 'alpaca_eval_selfinstruct', + 'alpaca_eval_vicuna', 'compassarena_language_naive_average', + 'compassarena_knowledge_naive_average', + 'compassarena_reason_v2_naive_average', + 'compassarena_math_v2_naive_average', + 'compassarena_creationv2_zh_naive_average', + 'fofo_test_prompts_overall', 'fofo_test_prompts_cn_overall', + 'followbench_llmeval_en_HSR_AVG', + 'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1', + 'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3', + 'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5', + 'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2', + 'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4', + 'followbench_llmeval_en_SSR_L5', 'simpleqa_f1' + ]]) + @pytest.mark.chat_subjective + def test_chat_subjective(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score) + + @pytest.mark.parametrize('model, dataset', [( + p1, p2 + ) for p1 in ['internlm2_5-7b-turbomind'] for p2 in [ + 'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy', + 'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy', + 'bbh_naive_average', 'GPQA_diamond_accuracy', 'hellaswag_accuracy', + 'TheoremQA_score', 'winogrande_accuracy', 'gsm8k_accuracy', + 'GaokaoBench_weighted_average', 'math_accuracy', + 'Mathbench_naive_average', + 'wikibench-wiki-single_choice_cncircular_perf_4', + 'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average', + 'openai_humaneval_humaneval_pass@1', + 'openai_humaneval_v2_humaneval_pass@1', 'sanitized_mbpp_score', + 'dingo_en_192_score', 'dingo_zh_170_score', 'mmlu-stem_naive_average', + 'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average', + 'mmlu-other_naive_average', 'cmmlu-stem_naive_average', + 'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average', + 'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average', + 'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy', + 'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy', + 'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy', + 'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy', + 'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy', + 'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy', + 'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy', + 'college_naive_average', 'high_naive_average', 'middle_naive_average', + 'primary_naive_average', 'arithmetic_naive_average', + 'mathbench-a (average)_naive_average', + 'college_knowledge_naive_average', 'high_knowledge_naive_average', + 'middle_knowledge_naive_average', 'primary_knowledge_naive_average', + 
'mathbench-t (average)_naive_average' + ]]) + @pytest.mark.base_objective + def test_base_objective(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score) + + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['internlm2_5-7b-turbomind'] + for p2 in [ + 'Single-Needle-Retrieval(S-RT)-32000_naive_average', + 'Single-Needle-Retrieval-EN-32000_naive_average', + 'Single-Needle-Retrieval-ZH-32000_naive_average', + 'Single-Needle-Retrieval(S-RT)-100000_naive_average', + 'Single-Needle-Retrieval-EN-100000_naive_average', + 'Single-Needle-Retrieval-ZH-100000_naive_average', + 'Single-Needle-Retrieval(S-RT)-200000_naive_average', + 'Single-Needle-Retrieval-EN-200000_naive_average', + 'Single-Needle-Retrieval-ZH-200000_naive_average', + 'longbench_naive_average', 'longbench_zh_naive_average', + 'longbench_en_naive_average', + 'longbench_single-document-qa_naive_average', + 'longbench_multi-document-qa_naive_average', + 'longbench_summarization_naive_average', + 'longbench_few-shot-learning_naive_average', + 'longbench_synthetic-tasks_naive_average', + 'longbench_code-completion_naive_average' + ]]) + @pytest.mark.base_long_context + def test_base_long_context(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score) + + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['internlm2_5-7b-chat-1m-turbomind'] + for p2 in [ + 'ruler_8k_naive_average', 'ruler_32k_naive_average', + 'ruler_128k_naive_average', + 'NeedleBench-Overall-Score-8K_weighted_average', + 'NeedleBench-Overall-Score-32K_weighted_average', + 'NeedleBench-Overall-Score-128K_weighted_average', + 'longbench_naive_average', 'longbench_zh_naive_average', + 'longbench_en_naive_average', 'babilong_0k_naive_average', + 'babilong_4k_naive_average', 'babilong_16k_naive_average', + 'babilong_32k_naive_average', 'babilong_128k_naive_average', + 'babilong_256k_naive_average', + 'longbench_single-document-qa_naive_average', + 'longbench_multi-document-qa_naive_average', + 'longbench_summarization_naive_average', + 'longbench_few-shot-learning_naive_average', + 'longbench_synthetic-tasks_naive_average', + 'longbench_code-completion_naive_average' + ]]) + @pytest.mark.chat_long_context + def test_chat_long_context(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score) + + @pytest.mark.usefixtures('result_scores') @pytest.mark.usefixtures('baseline_scores') class TestCmdCase: @pytest.mark.case1 @pytest.mark.parametrize('model, dataset', - [('internlm2_5-7b-hf', 'race-middle'), - ('internlm2_5-7b-hf', 'race-high'), - ('internlm2_5-7b-hf', 'demo_gsm8k'), - ('internlm2-1.8b-hf', 'race-middle'), - ('internlm2-1.8b-hf', 'race-high'), - ('internlm2-1.8b-hf', 'demo_gsm8k')]) + [('internlm2_5-7b-hf', 'race-middle_accuracy'), + ('internlm2_5-7b-hf', 'race-high_accuracy'), + ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'), + ('internlm2-1.8b-hf', 'race-middle_accuracy'), + ('internlm2-1.8b-hf', 'race-high_accuracy'), + ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')]) def 
test_cmd_case1(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(model, result_score, base_score) @pytest.mark.case2 - @pytest.mark.parametrize('model, dataset', - [('internlm2_5-7b-chat-lmdeploy', 'race-middle'), - ('internlm2_5-7b-chat-lmdeploy', 'race-high'), - ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'), - ('internlm2-chat-1.8b-lmdeploy', 'race-middle'), - ('internlm2-chat-1.8b-lmdeploy', 'race-high'), - ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')]) + @pytest.mark.parametrize( + 'model, dataset', + [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'), + ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'), + ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'), + ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'), + ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'), + ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -250,19 +498,19 @@ class TestCmdCase: @pytest.mark.case3 @pytest.mark.parametrize('model, dataset', - [('internlm2_5-7b_hf', 'race-middle'), - ('internlm2_5-7b_hf', 'race-high'), - ('internlm2_5-7b_hf', 'demo_gsm8k')]) + [('internlm2_5-7b_hf', 'race-middle_accuracy'), + ('internlm2_5-7b_hf', 'race-high_accuracy'), + ('internlm2_5-7b_hf', 'demo_gsm8k_accuracy')]) def test_cmd_case3(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(model, result_score, base_score) @pytest.mark.case4 - @pytest.mark.parametrize('model, dataset', - [('internlm2_5-7b-chat_hf', 'race-middle'), - ('internlm2_5-7b-chat_hf', 'race-high'), - ('internlm2_5-7b-chat_hf', 'demo_gsm8k')]) + @pytest.mark.parametrize( + 'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'), + ('internlm2_5-7b-chat_hf', 'race-high_accuracy'), + ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')]) def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -310,8 +558,7 @@ def find_csv_files(directory): csv_files = [] for root, dirs, files in os.walk(directory): for file in files: - if file.endswith('.csv') and (file.startswith('summary') or - file.startswith('Subjective_all')): + if file.endswith('.csv') and file.startswith('summary'): csv_files.append(os.path.join(root, file)) csv_files_with_time = {f: os.path.getctime(f) for f in csv_files} @@ -324,24 +571,15 @@ def read_csv_file(file_path): with open(file_path, 'r') as csvfile: reader = csv.DictReader(csvfile) filtered_data = [] - if 'Subjective_all' not in file_path: - for row in reader: - if row['metric'] is not None and 'bpb' not in row['metric']: - filtered_row = { - k: v - for k, v in row.items() - if k not in ['version', 'metric', 'mode'] - } - filtered_data.append(filtered_row) - else: - for row in reader: - if row['Detailed Scores'] is not None: - filtered_row = row - filtered_row['dataset'] = filtered_row[ - 'Dataset'] + filtered_row['Detailed Scores'] - del filtered_row['Dataset'] - del filtered_row['Detailed Scores'] - filtered_data.append(filtered_row) + for row in reader: + if row['metric'] is not None and 'bpb' not in row[ + 'metric'] and '_' != row['metric']: + 
filtered_row = row + filtered_row['dataset'] = row['dataset'] + '_' + row['metric'] + del filtered_row['version'] + del filtered_row['metric'] + del filtered_row['mode'] + filtered_data.append(filtered_row) result = {} for data in filtered_data: diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 40cb1087..681ac5d3 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -1,34 +1,34 @@ internlm2_5-7b-hf: - demo_gsm8k: 42.19 - race-middle: 91.78 - race-high: 90.02 + demo_gsm8k_accuracy: 42.19 + race-middle_accuracy: 91.78 + race-high_accuracy: 90.02 internlm2_5-7b_hf: - demo_gsm8k: 42.19 - race-middle: 91.78 - race-high: 90.02 + demo_gsm8k_accuracy: 42.19 + race-middle_accuracy: 91.78 + race-high_accuracy: 90.02 internlm2-1.8b-hf: - demo_gsm8k: 15.62 - race-middle: 71.66 - race-high: 66.38 + demo_gsm8k_accuracy: 15.62 + race-middle_accuracy: 71.66 + race-high_accuracy: 66.38 internlm2_5-7b-chat-lmdeploy: - demo_gsm8k: 84.38 - race-middle: 92.76 - race-high: 90.54 + demo_gsm8k_accuracy: 84.38 + race-middle_accuracy: 92.76 + race-high_accuracy: 90.54 internlm2-chat-1.8b-lmdeploy: - demo_gsm8k: 31 - race-middle: 81.34 - race-high: 73.96 + demo_gsm8k_accuracy: 31 + race-middle_accuracy: 81.34 + race-high_accuracy: 73.96 internlm2_5-7b-chat_hf: - demo_gsm8k: 87.50 - race-middle: 92.76 - race-high: 90.48 + demo_gsm8k_accuracy: 87.50 + race-middle_accuracy: 92.76 + race-high_accuracy: 90.48 lmdeploy-api-test: - gsm8k: 83.78 - race-middle: 92.41 - race-high: 90.37 + gsm8k_accuracy: 83.78 + race-middle_accuracy: 92.41 + race-high_accuracy: 90.37 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 49393e05..0359b633 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -1,173 +1,447 @@ internlm2_5-7b-chat-hf_fullbench: - race-high: 93.75 - ARC-c: 93.75 - BoolQ: 81.25 - triviaqa_wiki_1shot: 50 - nq_open_1shot: 25 - IFEval: 50 - drop: 81.25 - GPQA_diamond: 25 - hellaswag: 87.5 - TheoremQA: 18.75 - musr_average: 39.58 - gsm8k: 56.25 - math: 75 - cmo_fib: 6.25 - aime2024: 6.25 - wikibench-wiki-single_choice_cncircular: 50 - sanitized_mbpp: 68.75 - ds1000: 16.96 - lcb_code_generation: 12.5 - lcb_code_execution: 43.75 - lcb_test_output: 18.75 - bbh-logical_deduction_seven_objects: 50 - bbh-multistep_arithmetic_two: 68.75 - mmlu-other: 72.6 - cmmlu-china-specific: 76.25 - mmlu_pro_math: 25 - ds1000_Pandas: 12.5 - ds1000_Numpy: 0 - ds1000_Tensorflow: 12.5 - ds1000_Scipy: 18.75 - ds1000_Sklearn: 18.75 - ds1000_Pytorch: 12.5 - ds1000_Matplotlib: 43.75 - openai_mmmlu_lite_AR-XY: 37.5 - college: 12.5 - college_knowledge: 87.5 - Alignbench总分: 0.65 - Alignbench专业能力: 7.83 - AlpacaEvaltotal: 0 - AlpacaEvalhelpful_base: 0 - CompassArenacompassarena_language: 60 - CompassArenacompassarena_knowledge: 56 - CompassArenacompassarena_reason_v2: 50 - CompassArenacompassarena_math_v2: 53.5 - CompassArenacompassarena_creationv2_zh: 48.75 - Fofofofo_test_prompts: 1 - followbenchHSR_AVG: 1 - followbenchSSR_AVG: 1 - followbenchHSR_L1: 1 - followbenchHSR_L2: 1 - followbenchHSR_L3: 1 - followbenchHSR_L4: 1 - followbenchHSR_L5: 1 - followbenchSSR_L1: 1 - followbenchSSR_L2: 1 - followbenchSSR_L3: 1 - followbenchSSR_L4: 1 - followbenchSSR_L5: 1 - MTBench101average: 8.1 - Wildbenchscore: -3.3333333333333335 + race-high_accuracy: 93.75 + ARC-c_accuracy: 93.75 + BoolQ_accuracy: 81.25 + triviaqa_wiki_1shot_score: 50 + 
nq_open_1shot_score: 25 + IFEval_Prompt-level-strict-accuracy: 50 + drop_accuracy: 81.25 + GPQA_diamond_accuracy: 25 + hellaswag_accuracy: 87.5 + TheoremQA_score: 18.75 + musr_average_naive_average: 39.58 + korbench_single_naive_average: 40 + gsm8k_accuracy: 62.50 + math_accuracy: 75 + cmo_fib_accuracy: 6.25 + aime2024_accuracy: 6.25 + wikibench-wiki-single_choice_cncircular_perf_4: 50 + sanitized_mbpp_score: 68.75 + ds1000_naive_average: 16.96 + lcb_code_generation_pass@1: 12.5 + lcb_code_execution_pass@1: 43.75 + lcb_test_output_pass@1: 18.75 + bbh-logical_deduction_seven_objects_score: 50 + bbh-multistep_arithmetic_two_score: 68.75 + mmlu-other_naive_average: 72.6 + cmmlu-china-specific_naive_average: 76.25 + mmlu_pro_math_accuracy: 25 + ds1000_Pandas_accuracy: 12.5 + ds1000_Numpy_accuracy: 0 + ds1000_Tensorflow_accuracy: 12.5 + ds1000_Scipy_accuracy: 18.75 + ds1000_Sklearn_accuracy: 18.75 + ds1000_Pytorch_accuracy: 12.5 + ds1000_Matplotlib_accuracy: 43.75 + openai_mmmlu_lite_AR-XY_accuracy: 37.5 + college_naive_average: 12.5 + college_knowledge_naive_average: 87.5 + alignment_bench_v1_1_总分: 0.66 + alpaca_eval_total: 0 + arenahard_score: 50 + Followbench_naive_average: 1 + CompassArena_naive_average: 54.48 + mtbench101_avg: 8.1 + wildbench_average: -9.86 + simpleqa_accuracy_given_attempted: 0 + chinese_simpleqa_given_attempted_accuracy: 1 + alignment_bench_v1_1_专业能力: 8 + alignment_bench_v1_1_数学计算: 0 + alignment_bench_v1_1_基本任务: 0 + alignment_bench_v1_1_逻辑推理: 0 + alignment_bench_v1_1_中文理解: 0 + alignment_bench_v1_1_文本写作: 0 + alignment_bench_v1_1_角色扮演: 0 + alignment_bench_v1_1_综合问答: 0 + alpaca_eval_helpful_base: 0 + compassarena_language_naive_average: 62 + compassarena_knowledge_naive_average: 56 + compassarena_reason_v2_naive_average: 49 + compassarena_math_v2_naive_average: 57.05 + compassarena_creationv2_zh_naive_average: 48.34 + fofo_test_prompts_overall: 1 + followbench_llmeval_en_HSR_AVG: 1 + followbench_llmeval_en_SSR_AVG: 1 + followbench_llmeval_en_HSR_L1: 1 + followbench_llmeval_en_HSR_L2: 1 + followbench_llmeval_en_HSR_L3: 1 + followbench_llmeval_en_HSR_L4: 1 + followbench_llmeval_en_HSR_L5: 1 + followbench_llmeval_en_SSR_L1: 1 + followbench_llmeval_en_SSR_L2: 1 + followbench_llmeval_en_SSR_L3: 1 + followbench_llmeval_en_SSR_L4: 1 + followbench_llmeval_en_SSR_L5: 1 + simpleqa_f1: 0 internlm2_5-7b-chat-turbomind_fullbench: - race-high: 93.75 - ARC-c: 87.5 - BoolQ: 68.75 - triviaqa_wiki_1shot: 50 - nq_open_1shot: 25 - IFEval: 50 - drop: 75 - hellaswag: 81.25 - TheoremQA: 6.25 - musr_average: 37.5 - gsm8k: 68.75 - math: 75 - GPQA_diamond: 25 - cmo_fib: 6.25 - aime2024: 6.25 - wikibench-wiki-single_choice_cncircular: 25 - sanitized_mbpp: 68.75 - ds1000: 13.39 - lcb_code_generation: 12.5 - lcb_code_execution: 43.75 - lcb_test_output: 12.5 - bbh-logical_deduction_seven_objects: 56.25 - bbh-multistep_arithmetic_two: 68.75 - mmlu-other: 74.04 - cmmlu-china-specific: 76.25 - mmlu_pro_math: 25 - ds1000_Pandas: 0 - ds1000_Numpy: 0 - ds1000_Tensorflow: 12.5 - ds1000_Scipy: 18.75 - ds1000_Sklearn: 18.75 - ds1000_Pytorch: 6.25 - ds1000_Matplotlib: 37.5 - openai_mmmlu_lite_AR-XY: 37.5 - college: 0 - college_knowledge: 87.5 - Alignbench总分: 0.64 - Alignbench专业能力: 7.6 - AlpacaEvaltotal: 10 - AlpacaEvalhelpful_base: 10 - CompassArenacompassarena_language: 59 - CompassArenacompassarena_knowledge: 57 - CompassArenacompassarena_reason_v2: 49.5 - CompassArenacompassarena_math_v2: 51 - CompassArenacompassarena_creationv2_zh: 43.75 - Fofofofo_test_prompts: 1 - followbenchHSR_AVG: 1 - followbenchSSR_AVG: 
1 - followbenchHSR_L1: 1 - followbenchHSR_L2: 1 - followbenchHSR_L3: 1 - followbenchHSR_L4: 1 - followbenchHSR_L5: 1 - followbenchSSR_L1: 1 - followbenchSSR_L2: 1 - followbenchSSR_L3: 1 - followbenchSSR_L4: 1 - followbenchSSR_L5: 1 - MTBench101average: 8.1 - Wildbenchscore: -8.333333333333334 + race-high_accuracy: 93.75 + ARC-c_accuracy: 87.5 + BoolQ_accuracy: 68.75 + triviaqa_wiki_1shot_score: 50 + nq_open_1shot_score: 25 + IFEval_Prompt-level-strict-accuracy: 50 + drop_accuracy: 75 + GPQA_diamond_accuracy: 25 + hellaswag_accuracy: 81.25 + TheoremQA_score: 6.25 + musr_average_naive_average: 37.5 + korbench_single_naive_average: 41.25 + gsm8k_accuracy: 68.75 + math_accuracy: 75 + cmo_fib_accuracy: 6.25 + aime2024_accuracy: 6.25 + wikibench-wiki-single_choice_cncircular_perf_4: 25 + sanitized_mbpp_score: 68.75 + ds1000_naive_average: 13.39 + lcb_code_generation_pass@1: 12.5 + lcb_code_execution_pass@1: 43.75 + lcb_test_output_pass@1: 12.5 + bbh-logical_deduction_seven_objects_score: 56.25 + bbh-multistep_arithmetic_two_score: 68.75 + mmlu-other_naive_average: 74.04 + cmmlu-china-specific_naive_average: 76.25 + mmlu_pro_math_accuracy: 25 + ds1000_Pandas_accuracy: 0 + ds1000_Numpy_accuracy: 0 + ds1000_Tensorflow_accuracy: 12.5 + ds1000_Scipy_accuracy: 18.75 + ds1000_Sklearn_accuracy: 18.75 + ds1000_Pytorch_accuracy: 6.25 + ds1000_Matplotlib_accuracy: 37.5 + openai_mmmlu_lite_AR-XY_accuracy: 37.5 + college_naive_average: 0 + college_knowledge_naive_average: 87.5 + alignment_bench_v1_1_总分: 0.68 + alpaca_eval_total: 10 + arenahard_score: 50 + Followbench_naive_average: 1 + CompassArena_naive_average: 52.95 + mtbench101_avg: 8.1 + wildbench_average: -4.44 + simpleqa_accuracy_given_attempted: 0 + chinese_simpleqa_given_attempted_accuracy: 1 + alignment_bench_v1_1_专业能力: 8.2 + alignment_bench_v1_1_数学计算: 0 + alignment_bench_v1_1_基本任务: 0 + alignment_bench_v1_1_逻辑推理: 0 + alignment_bench_v1_1_中文理解: 0 + alignment_bench_v1_1_文本写作: 0 + alignment_bench_v1_1_角色扮演: 0 + alignment_bench_v1_1_综合问答: 0 + alpaca_eval_helpful_base: 10 + compassarena_language_naive_average: 61.5 + compassarena_knowledge_naive_average: 56.5 + compassarena_reason_v2_naive_average: 47.5 + compassarena_math_v2_naive_average: 53.03 + compassarena_creationv2_zh_naive_average: 46.22 + fofo_test_prompts_overall: 1 + followbench_llmeval_en_HSR_AVG: 1 + followbench_llmeval_en_SSR_AVG: 1 + followbench_llmeval_en_HSR_L1: 1 + followbench_llmeval_en_HSR_L2: 1 + followbench_llmeval_en_HSR_L3: 1 + followbench_llmeval_en_HSR_L4: 1 + followbench_llmeval_en_HSR_L5: 1 + followbench_llmeval_en_SSR_L1: 1 + followbench_llmeval_en_SSR_L2: 1 + followbench_llmeval_en_SSR_L3: 1 + followbench_llmeval_en_SSR_L4: 1 + followbench_llmeval_en_SSR_L5: 1 + simpleqa_f1: 0 internlm2_5-7b-hf_fullbench: - race-high: 100 - ARC-c: 68.75 - BoolQ: 87.5 - GPQA_diamond: 62.5 - drop: 62.5 - math: 12.5 - wikibench-wiki-single_choice_cncircular: 25 - sanitized_mbpp: 56.25 - gsm8k: 37.5 - triviaqa_wiki_1shot: 43.75 - nq_open_1shot: 43.75 - winogrande: 75 - hellaswag: 93.75 - TheoremQA: 25 - dingo_en_192: 37.5 - dingo_zh_170: 100 - college: 12.5 - college_knowledge: 87.5 - bbh-logical_deduction_seven_objects: 43.75 - bbh-multistep_arithmetic_two: 56.25 - mmlu-other: 76.92 - cmmlu-china-specific: 84.17 - mmlu_pro_math: 18.75 + race-high_accuracy: 100 + ARC-c_accuracy: 68.75 + BoolQ_accuracy: 87.5 + triviaqa_wiki_1shot_score: 43.75 + nq_open_1shot_score: 43.75 + drop_accuracy: 62.5 + GPQA_diamond_accuracy: 62.5 + hellaswag_accuracy: 93.75 + TheoremQA_score: 25 + winogrande_accuracy: 
75 + gsm8k_accuracy: 37.5 + GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 + GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 + math_accuracy: 12.5 + wikibench-wiki-single_choice_cncircular_perf_4: 25 + sanitized_mbpp_score: 56.25 + dingo_en_192_score: 37.5 + dingo_zh_170_score: 100 + mmlu-other_accuracy: 76.92 + cmmlu-china-specific_accuracy: 84.17 + mmlu_pro_math_accuracy: 18.75 + bbh-logical_deduction_seven_objects_score: 43.75 + bbh-multistep_arithmetic_two_score: 56.25 + college_naive_average: 12.5 + college_knowledge_naive_average: 87.5 internlm2_5-7b-turbomind_fullbench: - race-high: 100 - ARC-c: 68.75 - BoolQ: 87.5 - GPQA_diamond: 62.5 - drop: 62.5 - math: 18.75 - wikibench-wiki-single_choice_cncircular: 25 - sanitized_mbpp: 56.25 - gsm8k: 68.75 - triviaqa_wiki_1shot: 43.75 - nq_open_1shot: 43.75 - winogrande: 87.5 - hellaswag: 93.75 - TheoremQA: 31.25 - dingo_en_192: 43.75 - dingo_zh_170: 100 - college: 12.5 - college_knowledge: 87.5 - bbh-logical_deduction_seven_objects: 50 - bbh-multistep_arithmetic_two: 56.25 - mmlu-other: 76.92 - cmmlu-china-specific: 84.17 - mmlu_pro_math: 18.75 + race-high_accuracy: 100 + ARC-c_accuracy: 68.75 + BoolQ_accuracy: 87.5 + triviaqa_wiki_1shot_score: 43.75 + nq_open_1shot_score: 43.75 + drop_accuracy: 62.5 + GPQA_diamond_accuracy: 62.5 + hellaswag_accuracy: 93.75 + TheoremQA_score: 31.25 + winogrande_accuracy: 87.5 + gsm8k_accuracy: 68.75 + GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 + GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 + math_accuracy: 18.75 + wikibench-wiki-single_choice_cncircular_perf_4: 25 + sanitized_mbpp_score: 56.25 + dingo_en_192_score: 43.75 + dingo_zh_170_score: 100 + mmlu-other_accuracy: 76.92 + cmmlu-china-specific_accuracy: 84.17 + mmlu_pro_math_accuracy: 18.75 + bbh-logical_deduction_seven_objects_score: 50 + bbh-multistep_arithmetic_two_score: 56.25 + college_naive_average: 12.5 + college_knowledge_naive_average: 87.5 + +internlm2_5-7b-turbomind: + race-high_accuracy: 89.28 + ARC-c_accuracy: 52.2 + BoolQ_accuracy: 89.72 + triviaqa_wiki_1shot_score: 65.88 + nq_open_1shot_score: 34.82 + drop_accuracy: 68.1 + bbh_naive_average: 72.15 + GPQA_diamond_accuracy: 32.83 + hellaswag_accuracy: 88.36 + TheoremQA_score: 25 + winogrande_accuracy: 81.29 + gsm8k_accuracy: 74.68 + GaokaoBench_weighted_average: 58.19 + math_accuracy: 33.98 + Mathbench_naive_average: 48.38 + wikibench-wiki-single_choice_cncircular_perf_4: 29.1 + cmmlu_naive_average: 78.94 + mmlu_naive_average: 71.44 + mmlu_pro_naive_average: 38.18 + openai_humaneval_humaneval_pass@1: 59.76 + openai_humaneval_v2_humaneval_pass@1: 51.22 + sanitized_mbpp_score: 55.25 + dingo_en_192_score: 60.94 + dingo_zh_170_score: 67.65 + mmlu-stem_naive_average: 63.72 + mmlu-social-science_naive_average: 80.15 + mmlu-humanities_naive_average: 74.27 + mmlu-other_naive_average: 71.85 + cmmlu-stem_naive_average: 67.07 + cmmlu-social-science_naive_average: 81.49 + cmmlu-humanities_naive_average: 85.84 + cmmlu-other_naive_average: 82.69 + cmmlu-china-specific_naive_average: 79.88 + mmlu_pro_biology_accuracy: 58.58 + mmlu_pro_business_accuracy: 28.01 + mmlu_pro_chemistry_accuracy: 22.79 + mmlu_pro_computer_science_accuracy: 39.02 + mmlu_pro_economics_accuracy: 53.08 + mmlu_pro_engineering_accuracy: 25.7 + mmlu_pro_health_accuracy: 46.94 + mmlu_pro_history_accuracy: 43.04 + mmlu_pro_law_accuracy: 29.7 + mmlu_pro_math_accuracy: 24.2 + mmlu_pro_philosophy_accuracy: 42.48 + mmlu_pro_physics_accuracy: 26.02 + mmlu_pro_psychology_accuracy: 52.76 + mmlu_pro_other_accuracy: 42.21 + 
college_naive_average: 10.67
+  high_naive_average: 6.67
+  middle_naive_average: 26.67
+  primary_naive_average: 60
+  arithmetic_naive_average: 55
+  mathbench-a (average)_naive_average: 31.8
+  college_knowledge_naive_average: 62.34
+  high_knowledge_naive_average: 59.83
+  middle_knowledge_naive_average: 71.15
+  primary_knowledge_naive_average: 66.55
+  mathbench-t (average)_naive_average: 64.97
+  Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
+  Single-Needle-Retrieval-EN-32000_naive_average: 100
+  Single-Needle-Retrieval-ZH-32000_naive_average: 100
+  Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
+  Single-Needle-Retrieval-EN-100000_naive_average: 100
+  Single-Needle-Retrieval-ZH-100000_naive_average: 100
+  Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
+  Single-Needle-Retrieval-EN-200000_naive_average: 100
+  Single-Needle-Retrieval-ZH-200000_naive_average: 100
+  longbench_naive_average: 46.19
+  longbench_zh_naive_average: 49.3
+  longbench_en_naive_average: 43.97
+  longbench_single-document-qa_naive_average: 42.84
+  longbench_multi-document-qa_naive_average: 37.29
+  longbench_summarization_naive_average: 23.21
+  longbench_few-shot-learning_naive_average: 61.67
+  longbench_synthetic-tasks_naive_average: 60.05
+  longbench_code-completion_naive_average: 52.09
+
+internlm2_5-7b-chat-turbomind:
+  race-high_accuracy: 86.16
+  ARC-c_accuracy: 90.17
+  BoolQ_accuracy: 87.89
+  triviaqa_wiki_1shot_score: 64.91
+  nq_open_1shot_score: 22.69
+  mmmlu_lite_naive_average: 44.96
+  IFEval_Prompt-level-strict-accuracy: 58.04
+  drop_accuracy: 77.68
+  bbh_naive_average: 73.14
+  GPQA_diamond_accuracy: 25.76
+  hellaswag_accuracy: 94.79
+  TheoremQA_score: 21.5
+  musr_average_naive_average: 51.03
+  korbench_single_naive_average: 31.92
+  ARC_Prize_Public_Evaluation_accuracy: 0.01
+  gsm8k_accuracy: 86.73
+  GaokaoBench_weighted_average: 77.89
+  math_accuracy: 61.5
+  cmo_fib_accuracy: 12.5
+  aime2024_accuracy: 3.33
+  Mathbench_naive_average: 65.17
+  wikibench-wiki-single_choice_cncircular_perf_4: 31.55
+  cmmlu_naive_average: 74.14
+  mmlu_naive_average: 70.52
+  mmlu_pro_naive_average: 44.98
+  openai_humaneval_humaneval_pass@1: 70.73
+  sanitized_mbpp_score: 63.81
+  humanevalx_naive_average: 38.17
+  ds1000_naive_average: 14.15
+  lcb_code_generation_pass@1: 17.75
+  lcb_code_execution_pass@1: 32.57
+  lcb_test_output_pass@1: 24.89
+  bigcodebench_hard_instruct_pass@1: 0.08
+  bigcodebench_hard_complete_pass@1: 0.06
+  teval_naive_average: 80.03
+  qa_dingo_cn_score: 99.01
+  mmlu-stem_naive_average: 68.2
+  mmlu-social-science_naive_average: 76.11
+  mmlu-humanities_naive_average: 68.71
+  mmlu-other_naive_average: 70.56
+  cmmlu-stem_naive_average: 66.27
+  cmmlu-social-science_naive_average: 75.7
+  cmmlu-humanities_naive_average: 77.7
+  cmmlu-other_naive_average: 77.71
+  cmmlu-china-specific_naive_average: 72.94
+  mmlu_pro_biology_accuracy: 66.25
+  mmlu_pro_business_accuracy: 48.42
+  mmlu_pro_chemistry_accuracy: 35.25
+  mmlu_pro_computer_science_accuracy: 47.56
+  mmlu_pro_economics_accuracy: 55.92
+  mmlu_pro_engineering_accuracy: 30.44
+  mmlu_pro_health_accuracy: 45.97
+  mmlu_pro_history_accuracy: 41.21
+  mmlu_pro_law_accuracy: 25.79
+  mmlu_pro_math_accuracy: 54.03
+  mmlu_pro_philosophy_accuracy: 36.47
+  mmlu_pro_physics_accuracy: 37.41
+  mmlu_pro_psychology_accuracy: 58.77
+  mmlu_pro_other_accuracy: 46.21
+  humanevalx-python_pass@1: 53.66
+  humanevalx-cpp_pass@1: 24.39
+  humanevalx-go_pass@1: 0
+  humanevalx-java_pass@1: 57.93
+  humanevalx-js_pass@1: 54.88
+  ds1000_Pandas_accuracy: 12.03
+  ds1000_Numpy_accuracy: 4.09
+  ds1000_Tensorflow_accuracy: 11.11
+  ds1000_Scipy_accuracy: 8.49
+  ds1000_Sklearn_accuracy: 6.96
+  ds1000_Pytorch_accuracy: 7.35
+  ds1000_Matplotlib_accuracy: 49.03
+  openai_mmmlu_lite_AR-XY_accuracy: 17.89
+  openai_mmmlu_lite_BN-BD_accuracy: 27.58
+  openai_mmmlu_lite_DE-DE_accuracy: 51.16
+  openai_mmmlu_lite_ES-LA_accuracy: 56.84
+  openai_mmmlu_lite_FR-FR_accuracy: 57.96
+  openai_mmmlu_lite_HI-IN_accuracy: 33.68
+  openai_mmmlu_lite_ID-ID_accuracy: 51.02
+  openai_mmmlu_lite_IT-IT_accuracy: 50.46
+  openai_mmmlu_lite_JA-JP_accuracy: 50.53
+  openai_mmmlu_lite_KO-KR_accuracy: 45.05
+  openai_mmmlu_lite_PT-BR_accuracy: 57.68
+  openai_mmmlu_lite_SW-KE_accuracy: 32.77
+  openai_mmmlu_lite_YO-NG_accuracy: 31.79
+  openai_mmmlu_lite_ZH-CN_accuracy: 65.05
+  college_naive_average: 20.33
+  high_naive_average: 47.67
+  middle_naive_average: 62
+  primary_naive_average: 72
+  arithmetic_naive_average: 62.33
+  mathbench-a (average)_naive_average: 52.87
+  college_knowledge_naive_average: 70.57
+  high_knowledge_naive_average: 70.13
+  middle_knowledge_naive_average: 81.17
+  primary_knowledge_naive_average: 88.01
+  mathbench-t (average)_naive_average: 77.47
+  alignment_bench_v1_1_总分: 5.68
+  alpaca_eval_total: 25.96
+  arenahard_score: 17.15
+  Followbench_naive_average: 0.81
+  CompassArena_naive_average: 34.61
+  FoFo_naive_average: 0.38
+  mtbench101_avg: 8.01
+  wildbench_average: -15.69
+  simpleqa_accuracy_given_attempted: 0.04
+  chinese_simpleqa_given_attempted_accuracy: 0.34
+  alignment_bench_v1_1_专业能力: 6.05
+  alignment_bench_v1_1_数学计算: 5.87
+  alignment_bench_v1_1_基本任务: 6.01
+  alignment_bench_v1_1_逻辑推理: 4.48
+  alignment_bench_v1_1_中文理解: 6.17
+  alignment_bench_v1_1_文本写作: 6.06
+  alignment_bench_v1_1_角色扮演: 6.3
+  alignment_bench_v1_1_综合问答: 6.45
+  alpaca_eval_helpful_base: 17.83
+  alpaca_eval_koala: 28.21
+  alpaca_eval_oasst: 23.4
+  alpaca_eval_selfinstruct: 30.95
+  alpaca_eval_vicuna: 25
+  compassarena_language_naive_average: 52.5
+  compassarena_knowledge_naive_average: 36
+  compassarena_reason_v2_naive_average: 35
+  compassarena_math_v2_naive_average: 19.91
+  compassarena_creationv2_zh_naive_average: 29.64
+  fofo_test_prompts_overall: 0.35
+  fofo_test_prompts_cn_overall: 0.41
+  followbench_llmeval_en_HSR_AVG: 0.73
+  followbench_llmeval_en_SSR_AVG: 0.88
+  followbench_llmeval_en_HSR_L1: 0.94
+  followbench_llmeval_en_HSR_L2: 0.77
+  followbench_llmeval_en_HSR_L3: 0.73
+  followbench_llmeval_en_HSR_L4: 0.68
+  followbench_llmeval_en_HSR_L5: 0.54
+  followbench_llmeval_en_SSR_L1: 0.94
+  followbench_llmeval_en_SSR_L2: 0.88
+  followbench_llmeval_en_SSR_L3: 0.87
+  followbench_llmeval_en_SSR_L4: 0.87
+  followbench_llmeval_en_SSR_L5: 0.85
+  simpleqa_f1: 0.04
+
+internlm2_5-7b-chat-1m-turbomind:
+  ruler_8k_naive_average: 88.53
+  ruler_32k_naive_average: 83.84
+  ruler_128k_naive_average: 70.94
+  NeedleBench-Overall-Score-8K_weighted_average: 91.89
+  NeedleBench-Overall-Score-32K_weighted_average: 91.42
+  NeedleBench-Overall-Score-128K_weighted_average: 88.57
+  longbench_naive_average: 46.44
+  longbench_zh_naive_average: 45.19
+  longbench_en_naive_average: 45.71
+  babilong_0k_naive_average: 79.3
+  babilong_4k_naive_average: 67
+  babilong_16k_naive_average: 52.7
+  babilong_32k_naive_average: 48.9
+  babilong_128k_naive_average: 40.8
+  babilong_256k_naive_average: 23.5
+  longbench_single-document-qa_naive_average: 43.56
+  longbench_multi-document-qa_naive_average: 46.24
+  longbench_summarization_naive_average: 24.32
+  longbench_few-shot-learning_naive_average: 51.67
+  longbench_synthetic-tasks_naive_average: 66.83
+  longbench_code-completion_naive_average: 45.99
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
index 68f6660a..f1254343 100644
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -1,459 +1,459 @@
 baichuan2-7b-chat-hf:
-  gsm8k: 18.75
-  race-high: 78.12
+  gsm8k_accuracy: 18.75
+  race-high_accuracy: 78.12
 glm-4-9b-chat-hf:
-  gsm8k: 68.75
-  race-high: 90.62
+  gsm8k_accuracy: 68.75
+  race-high_accuracy: 90.62
 glm-4-9b-chat-turbomind:
-  gsm8k: 75.00
-  race-high: 90.62
+  gsm8k_accuracy: 75.00
+  race-high_accuracy: 90.62
 glm-4-9b-chat-vllm:
-  gsm8k: 65.62
-  race-high: 90.62
+  gsm8k_accuracy: 65.62
+  race-high_accuracy: 90.62
 deepseek-7b-chat-hf:
-  gsm8k: 46.88
-  race-high: 81.25
+  gsm8k_accuracy: 46.88
+  race-high_accuracy: 81.25
 deepseek-moe-16b-chat-hf:
-  gsm8k: 50
-  race-high: 68.75
+  gsm8k_accuracy: 50
+  race-high_accuracy: 68.75
 deepseek-7b-chat-vllm:
-  gsm8k: 43.75
-  race-high: 75
+  gsm8k_accuracy: 43.75
+  race-high_accuracy: 75
 gemma2-2b-it-hf:
-  gsm8k: 50
-  race-high: 71.88
+  gsm8k_accuracy: 50
+  race-high_accuracy: 71.88
 gemma2-9b-it-hf:
-  gsm8k: 71.88
-  race-high: 84.38
+  gsm8k_accuracy: 71.88
+  race-high_accuracy: 84.38
 gemma-2b-it-hf:
-  gsm8k: 3.12
-  race-high: 40.62
+  gsm8k_accuracy: 3.12
+  race-high_accuracy: 40.62
 gemma-7b-it-hf:
-  gsm8k: 40.62
-  race-high: 68.75
+  gsm8k_accuracy: 40.62
+  race-high_accuracy: 68.75
 gemma-2-9b-it-turbomind:
-  gsm8k: 65.62
-  race-high: 84.38
+  gsm8k_accuracy: 65.62
+  race-high_accuracy: 84.38
 gemma-7b-it-vllm:
-  gsm8k: 34.38
-  race-high: 68.75
+  gsm8k_accuracy: 34.38
+  race-high_accuracy: 68.75
 internlm2_5-7b-chat-hf:
-  gsm8k: 84.38
-  race-high: 90.62
+  gsm8k_accuracy: 84.38
+  race-high_accuracy: 90.62
 internlm2_5-7b-chat-turbomind:
-  gsm8k: 84.38
-  race-high: 90.62
+  gsm8k_accuracy: 84.38
+  race-high_accuracy: 90.62
 internlm2-chat-1.8b-turbomind:
-  gsm8k: 25
-  race-high: 84.38
+  gsm8k_accuracy: 25
+  race-high_accuracy: 84.38
 internlm2-chat-1.8b-sft-turbomind:
-  gsm8k: 21.88
-  race-high: 84.38
+  gsm8k_accuracy: 21.88
+  race-high_accuracy: 84.38
 internlm2-chat-7b-lmdeploy:
-  gsm8k: 53.12
-  race-high: 84.38
+  gsm8k_accuracy: 53.12
+  race-high_accuracy: 84.38
 internlm2-chat-7b-sft-turbomind:
-  gsm8k: 50
-  race-high: 90.62
+  gsm8k_accuracy: 50
+  race-high_accuracy: 90.62
 internlm2-chat-7b-vllm:
-  gsm8k: 43.75
-  race-high: 87.5
+  gsm8k_accuracy: 43.75
+  race-high_accuracy: 87.5
 llama-3_1-8b-instruct-hf:
-  gsm8k: 84.38
-  race-high: 90.62
+  gsm8k_accuracy: 84.38
+  race-high_accuracy: 90.62
 llama-3_2-3b-instruct-hf:
-  gsm8k: 65.62
-  race-high: 81.25
+  gsm8k_accuracy: 68.75
+  race-high_accuracy: 81.25
 llama-3-8b-instruct-hf:
-  gsm8k: 68.75
-  race-high: 87.5
+  gsm8k_accuracy: 68.75
+  race-high_accuracy: 87.5
 llama-3_1-8b-instruct-turbomind:
-  gsm8k: 78.12
-  race-high: 90.62
+  gsm8k_accuracy: 78.12
+  race-high_accuracy: 90.62
 llama-3_2-3b-instruct-turbomind:
-  gsm8k: 62.50
-  race-high: 81.25
+  gsm8k_accuracy: 65.62
+  race-high_accuracy: 81.25
 llama-3-8b-instruct-turbomind:
-  gsm8k: 68.75
-  race-high: 87.5
+  gsm8k_accuracy: 68.75
+  race-high_accuracy: 87.5
 mistral-7b-instruct-v0.2-hf:
-  gsm8k: 40.62
-  race-high: 75
+  gsm8k_accuracy: 40.62
+  race-high_accuracy: 75
 mistral-7b-instruct-v0.3-hf:
-  gsm8k: 40.62
-  race-high: 75
+  gsm8k_accuracy: 40.62
+  race-high_accuracy: 75
 mistral-nemo-instruct-2407-hf:
-  gsm8k: 75
-  race-high: 81.25
+  gsm8k_accuracy: 75
+  race-high_accuracy: 81.25
 mistral-nemo-instruct-2407-turbomind:
-  gsm8k: 68.75
-  race-high: 87.50
+  gsm8k_accuracy: 68.75
+  race-high_accuracy: 87.50
 mistral-7b-instruct-v0.1-vllm:
-  gsm8k: 34.38
-  race-high: 68.75
+  gsm8k_accuracy: 34.38
+  race-high_accuracy: 68.75
 mistral-7b-instruct-v0.2-vllm:
-  gsm8k: 43.75
-  race-high: 75
+  gsm8k_accuracy: 43.75
+  race-high_accuracy: 75
 MiniCPM3-4B-hf:
-  gsm8k: 68.75
-  race-high: 84.38
+  gsm8k_accuracy: 68.75
+  race-high_accuracy: 84.38
 minicpm-2b-dpo-fp32-hf:
-  gsm8k: 56.25
-  race-high: 53.12
+  gsm8k_accuracy: 56.25
+  race-high_accuracy: 53.12
 minicpm-2b-sft-bf16-hf:
-  gsm8k: 46.88
-  race-high: 65.62
+  gsm8k_accuracy: 46.88
+  race-high_accuracy: 65.62
 minicpm-2b-sft-fp32-hf:
-  gsm8k: 46.88
-  race-high: 65.62
+  gsm8k_accuracy: 46.88
+  race-high_accuracy: 65.62
 phi-3-mini-4k-instruct-hf:
-  gsm8k: 56.25
-  race-high: 84.38
+  gsm8k_accuracy: 56.25
+  race-high_accuracy: 84.38
 qwen1.5-0.5b-chat-hf:
-  gsm8k: 0
-  race-high: 53.12
+  gsm8k_accuracy: 0
+  race-high_accuracy: 53.12
 qwen2-1.5b-instruct-hf:
-  gsm8k: 62.5
-  race-high: 84.38
+  gsm8k_accuracy: 62.5
+  race-high_accuracy: 84.38
 qwen2-7b-instruct-hf:
-  gsm8k: 68.75
-  race-high: 90.62
+  gsm8k_accuracy: 68.75
+  race-high_accuracy: 90.62
 qwen2-1.5b-instruct-turbomind:
-  gsm8k: 62.50
-  race-high: 84.38
+  gsm8k_accuracy: 62.50
+  race-high_accuracy: 84.38
 qwen2-7b-instruct-turbomind:
-  gsm8k: 81.25
-  race-high: 87.5
+  gsm8k_accuracy: 81.25
+  race-high_accuracy: 87.5
 qwen1.5-0.5b-chat-vllm:
-  gsm8k: 3.12
-  race-high: 53.12
+  gsm8k_accuracy: 3.12
+  race-high_accuracy: 53.12
 yi-1.5-6b-chat-hf:
-  gsm8k: 65.62
-  race-high: 84.38
+  gsm8k_accuracy: 65.62
+  race-high_accuracy: 84.38
 yi-1.5-9b-chat-hf:
-  gsm8k: 75
-  race-high: 93.75
+  gsm8k_accuracy: 75
+  race-high_accuracy: 93.75
 deepseek-v2-lite-chat-hf:
-  gsm8k: 43.75
-  race-high: 71.88
+  gsm8k_accuracy: 43.75
+  race-high_accuracy: 71.88
 internlm2_5-20b-chat-hf:
-  gsm8k: 84.38
-  race-high: 87.5
+  gsm8k_accuracy: 84.38
+  race-high_accuracy: 87.5
 internlm2_5-20b-chat-turbomind:
-  gsm8k: 84.38
-  race-high: 87.5
+  gsm8k_accuracy: 84.38
+  race-high_accuracy: 87.5
 mistral-small-instruct-2409-hf:
-  gsm8k: 81.25
-  race-high: 87.50
+  gsm8k_accuracy: 81.25
+  race-high_accuracy: 87.50
 mistral-small-instruct-2409-turbomind:
-  gsm8k: 78.12
-  race-high: 87.50
+  gsm8k_accuracy: 78.12
+  race-high_accuracy: 87.50
 qwen2.5-14b-instruct-hf:
-  gsm8k: 71.88
-  race-high: 96.88
+  gsm8k_accuracy: 71.88
+  race-high_accuracy: 96.88
 qwen2.5-14b-instruct-turbomind:
-  gsm8k: 71.88
-  race-high: 93.75
+  gsm8k_accuracy: 71.88
+  race-high_accuracy: 93.75
 glm-4-9b-hf:
-  gsm8k: 68.75
-  GPQA_diamond: 31.25
-  race-high: 93.75
-  winogrande: 84.38
+  gsm8k_accuracy: 68.75
+  GPQA_diamond_accuracy: 31.25
+  race-high_accuracy: 93.75
+  winogrande_accuracy: 84.38
 deepseek-moe-16b-base-hf:
-  gsm8k: 21.88
-  GPQA_diamond: 0
-  race-high: 21.88
-  winogrande: 65.62
+  gsm8k_accuracy: 21.88
+  GPQA_diamond_accuracy: 0
+  race-high_accuracy: 21.88
+  winogrande_accuracy: 65.62
 deepseek-7b-base-turbomind:
-  gsm8k: 21.88
-  GPQA_diamond: 0
-  race-high: 46.88
-  winogrande: 84.38
+  gsm8k_accuracy: 21.88
+  GPQA_diamond_accuracy: 0
+  race-high_accuracy: 46.88
+  winogrande_accuracy: 84.38
 deepseek-moe-16b-base-vllm:
-  gsm8k: 21.88
-  GPQA_diamond: 0
-  race-high: 25
-  winogrande: 68.75
+  gsm8k_accuracy: 21.88
+  GPQA_diamond_accuracy: 0
+  race-high_accuracy: 25
+  winogrande_accuracy: 68.75
 gemma2-2b-hf:
-  gsm8k: 31.25
-  GPQA_diamond: 3.12
-  race-high: 56.25
-  winogrande: 71.88
+  gsm8k_accuracy: 31.25
+  GPQA_diamond_accuracy: 3.12
+  race-high_accuracy: 56.25
+  winogrande_accuracy: 71.88
 gemma2-9b-hf:
-  gsm8k: 68.75
-  GPQA_diamond: 0
-  race-high: 81.25
-  winogrande: 84.38
+  gsm8k_accuracy: 68.75
+  GPQA_diamond_accuracy: 0
+  race-high_accuracy: 81.25
+  winogrande_accuracy: 84.38
 gemma-2b-hf:
-  gsm8k: 18.75
-  GPQA_diamond: 3.12
-  race-high: 25
-  winogrande: 53.12
+  gsm8k_accuracy: 18.75
+  GPQA_diamond_accuracy: 3.12
+  race-high_accuracy: 25
+  winogrande_accuracy: 53.12
 gemma-7b-hf:
-  gsm8k: 56.25
-  GPQA_diamond: 6.25
-  race-high: 65.62
-  winogrande: 78.12
+  gsm8k_accuracy: 56.25
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy: 65.62
+  winogrande_accuracy: 78.12
 gemma-2b-vllm:
-  gsm8k: 15.62
-  GPQA_diamond: 6.25
-  race-high:
-  winogrande:
+  gsm8k_accuracy: 15.62
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy:
+  winogrande_accuracy:
 gemma-7b-vllm:
-  gsm8k: 53.12
-  GPQA_diamond: 6.25
-  race-high:
-  winogrande:
+  gsm8k_accuracy: 53.12
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy:
+  winogrande_accuracy:
 internlm2_5-7b-hf:
-  gsm8k: 37.5
-  GPQA_diamond: 25
-  race-high: 93.75
-  winogrande: 71.88
+  gsm8k_accuracy: 37.5
+  GPQA_diamond_accuracy: 25
+  race-high_accuracy: 93.75
+  winogrande_accuracy: 71.88
 internlm2-7b-hf:
-  gsm8k: 53.12
-  GPQA_diamond: 18.75
-  race-high: 62.5
-  winogrande: 78.12
+  gsm8k_accuracy: 53.12
+  GPQA_diamond_accuracy: 18.75
+  race-high_accuracy: 62.5
+  winogrande_accuracy: 78.12
 internlm2-base-7b-hf:
-  gsm8k: 3.12
-  GPQA_diamond: 21.88
-  race-high: 75
-  winogrande: 65.62
+  gsm8k_accuracy: 3.12
+  GPQA_diamond_accuracy: 21.88
+  race-high_accuracy: 75
+  winogrande_accuracy: 65.62
 internlm2-1.8b-turbomind:
-  gsm8k: 12.5
-  GPQA_diamond: 12.5
-  race-high: 71.88
-  winogrande: 75
+  gsm8k_accuracy: 12.5
+  GPQA_diamond_accuracy: 12.5
+  race-high_accuracy: 71.88
+  winogrande_accuracy: 75
 internlm2_5-7b-turbomind:
-  gsm8k: 68.75
-  GPQA_diamond: 31.25
-  race-high: 93.75
-  winogrande: 84.38
+  gsm8k_accuracy: 68.75
+  GPQA_diamond_accuracy: 31.25
+  race-high_accuracy: 93.75
+  winogrande_accuracy: 84.38
 internlm2-7b-turbomind:
-  gsm8k: 56.25
-  GPQA_diamond: 21.88
-  race-high: 75
-  winogrande: 81.25
+  gsm8k_accuracy: 56.25
+  GPQA_diamond_accuracy: 21.88
+  race-high_accuracy: 75
+  winogrande_accuracy: 81.25
 internlm2-base-7b-turbomind:
-  gsm8k: 40.62
-  GPQA_diamond: 28.12
-  race-high: 84.38
-  winogrande: 71.88
+  gsm8k_accuracy: 40.62
+  GPQA_diamond_accuracy: 28.12
+  race-high_accuracy: 84.38
+  winogrande_accuracy: 71.88
 llama-2-7b-hf:
-  gsm8k: 21.88
-  GPQA_diamond: 21.88
-  race-high: 40.62
-  winogrande: 71.88
+  gsm8k_accuracy: 21.88
+  GPQA_diamond_accuracy: 21.88
+  race-high_accuracy: 40.62
+  winogrande_accuracy: 71.88
 llama-3_1-8b-hf:
-  gsm8k: 78.12
-  GPQA_diamond: 25
-  race-high: 90.62
-  winogrande: 62.5
+  gsm8k_accuracy: 78.12
+  GPQA_diamond_accuracy: 25
+  race-high_accuracy: 90.62
+  winogrande_accuracy: 62.5
 llama-3-8b-hf:
-  gsm8k: 46.88
-  GPQA_diamond: 6.25
-  race-high: 65.62
-  winogrande: 65.62
+  gsm8k_accuracy: 46.88
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy: 65.62
+  winogrande_accuracy: 65.62
 llama-3.1-8b-turbomind:
-  gsm8k: 56.25
-  GPQA_diamond: 6.25
-  race-high: 78.12
-  winogrande: 78.12
+  gsm8k_accuracy: 56.25
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy: 78.12
+  winogrande_accuracy: 78.12
 llama-3-8b-turbomind:
-  gsm8k: 50
-  GPQA_diamond: 9.38
-  race-high: 65.62
-  winogrande: 78.12
+  gsm8k_accuracy: 50
+  GPQA_diamond_accuracy: 9.38
+  race-high_accuracy: 65.62
+  winogrande_accuracy: 78.12
 mistral-7b-v0.2-hf:
-  gsm8k: 31.25
-  GPQA_diamond: 6.25
-  race-high: 62.5
-  winogrande: 59.38
+  gsm8k_accuracy: 31.25
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy: 62.5
+  winogrande_accuracy: 59.38
 mistral-7b-v0.3-hf:
-  gsm8k: 31.25
-  GPQA_diamond: 6.25
-  race-high: 62.5
-  winogrande: 59.38
+  gsm8k_accuracy: 31.25
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy: 62.5
+  winogrande_accuracy: 59.38
 mistral-7b-v0.2-vllm:
-  gsm8k: 34.38
-  GPQA_diamond: 6.25
-  race-high: 62.5
-  winogrande: 65.62
+  gsm8k_accuracy: 34.38
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy: 62.5
+  winogrande_accuracy: 65.62
 qwen2.5-7b-hf:
-  gsm8k: 81.25
-  GPQA_diamond: 18.75
-  race-high: 87.5
-  winogrande: 71.88
+  gsm8k_accuracy: 81.25
+  GPQA_diamond_accuracy: 18.75
+  race-high_accuracy: 87.5
+  winogrande_accuracy: 71.88
 qwen2.5-1.5b-turbomind:
-  gsm8k: 71.88
-  GPQA_diamond: 15.62
-  race-high: 78.12
-  winogrande: 71.88
+  gsm8k_accuracy: 71.88
+  GPQA_diamond_accuracy: 15.62
+  race-high_accuracy: 78.12
+  winogrande_accuracy: 71.88
 qwen2.5-7b-turbomind:
-  gsm8k: 71.88
-  GPQA_diamond: 25
-  race-high: 87.5
-  winogrande: 71.88
+  gsm8k_accuracy: 71.88
+  GPQA_diamond_accuracy: 25
+  race-high_accuracy: 87.5
+  winogrande_accuracy: 71.88
 qwen1.5-moe-a2.7b-hf:
-  gsm8k: 62.5
-  GPQA_diamond: 18.75
-  race-high: 84.38
-  winogrande: 75
+  gsm8k_accuracy: 62.5
+  GPQA_diamond_accuracy: 18.75
+  race-high_accuracy: 84.38
+  winogrande_accuracy: 75
 qwen2-0.5b-hf:
-  gsm8k: 25
-  GPQA_diamond: 0
-  race-high: 40.62
-  winogrande: 62.5
+  gsm8k_accuracy: 25
+  GPQA_diamond_accuracy: 0
+  race-high_accuracy: 40.62
+  winogrande_accuracy: 62.5
 qwen2-1.5b-hf:
-  gsm8k: 59.38
-  GPQA_diamond: 9.38
-  race-high: 81.25
-  winogrande: 62.5
+  gsm8k_accuracy: 59.38
+  GPQA_diamond_accuracy: 9.38
+  race-high_accuracy: 81.25
+  winogrande_accuracy: 62.5
 qwen2-7b-hf:
-  gsm8k: 68.75
-  GPQA_diamond: 9.38
-  race-high: 87.5
-  winogrande: 68.75
+  gsm8k_accuracy: 68.75
+  GPQA_diamond_accuracy: 9.38
+  race-high_accuracy: 87.5
+  winogrande_accuracy: 68.75
 qwen2-1.5b-turbomind:
-  gsm8k: 62.50
-  GPQA_diamond: 6.25
-  race-high: 81.25
-  winogrande: 75
+  gsm8k_accuracy: 62.50
+  GPQA_diamond_accuracy: 6.25
+  race-high_accuracy: 81.25
+  winogrande_accuracy: 75
 qwen2-7b-turbomind:
-  gsm8k: 68.75
-  GPQA_diamond: 12.5
-  race-high: 87.5
-  winogrande: 71.88
+  gsm8k_accuracy: 68.75
+  GPQA_diamond_accuracy: 12.5
+  race-high_accuracy: 87.5
+  winogrande_accuracy: 71.88
 qwen1.5-0.5b-vllm:
-  gsm8k: 9.38
-  GPQA_diamond: 0
-  race-high: 56.25
-  winogrande: 62.5
+  gsm8k_accuracy: 9.38
+  GPQA_diamond_accuracy: 0
+  race-high_accuracy: 56.25
+  winogrande_accuracy: 62.5
 yi-1.5-6b-hf:
-  gsm8k: 62.5
-  GPQA_diamond: 3.12
-  race-high: 87.5
-  winogrande: 62.5
+  gsm8k_accuracy: 62.5
+  GPQA_diamond_accuracy: 3.12
+  race-high_accuracy: 87.5
+  winogrande_accuracy: 62.5
 yi-1.5-9b-hf:
-  gsm8k: 75
-  GPQA_diamond: 40.62
-  race-high: 87.5
-  winogrande: 59.38
+  gsm8k_accuracy: 75
+  GPQA_diamond_accuracy: 40.62
+  race-high_accuracy: 87.5
+  winogrande_accuracy: 59.38
 deepseek-v2-lite-hf:
-  gsm8k: 28.12
-  GPQA_diamond: 21.88
-  race-high: 59.38
-  winogrande: 75
+  gsm8k_accuracy: 28.12
+  GPQA_diamond_accuracy: 21.88
+  race-high_accuracy: 59.38
+  winogrande_accuracy: 75
 internlm2-20b-hf:
-  gsm8k: 56.25
-  GPQA_diamond: 15.62
-  race-high: 68.75
-  winogrande: 75
+  gsm8k_accuracy: 56.25
+  GPQA_diamond_accuracy: 15.62
+  race-high_accuracy: 68.75
+  winogrande_accuracy: 75
 internlm2-base-20b-hf:
-  gsm8k: 12.5
-  GPQA_diamond: 9.38
-  race-high: 84.38
-  winogrande: 65.62
+  gsm8k_accuracy: 12.5
+  GPQA_diamond_accuracy: 9.38
+  race-high_accuracy: 84.38
+  winogrande_accuracy: 65.62
 internlm2-20b-turbomind:
-  gsm8k: 68.75
-  GPQA_diamond: 15.62
-  race-high: 68.75
-  winogrande: 81.25
+  gsm8k_accuracy: 68.75
+  GPQA_diamond_accuracy: 15.62
+  race-high_accuracy: 68.75
+  winogrande_accuracy: 81.25
 qwen2.5-14b-hf:
-  gsm8k: 75
-  GPQA_diamond: 37.5
-  race-high: 93.75
-  winogrande: 84.38
+  gsm8k_accuracy: 75
+  GPQA_diamond_accuracy: 37.5
+  race-high_accuracy: 93.75
+  winogrande_accuracy: 84.38
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 1d7a1189..35614e6a 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -38,28 +38,21 @@ on:
         description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
         type: string
         default: "['dsw_cu12']"
+      fullbench_eval:
+        required: true
+        description: 'fullbench volc functions'
+        type: string
+        default: "['base_long_context','base_objective','chat_long_context','chat_objective','chat_subjective']"
   schedule:
-    - cron: '15 16 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
+    - cron: '15 14 * * *'
 
 env:
-  CONDA_ENV: opencompass_regression
-  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
-  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
-  HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
-  HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
-  COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
   HF_DATASETS_OFFLINE: 1
   HF_EVALUATE_OFFLINE: 1
   TRANSFORMERS_OFFLINE: 1
   VLLM_USE_MODELSCOPE: false
   LMDEPLOY_USE_MODELSCOPE: false
   HF_HUB_OFFLINE: 1
-  TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
-  REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
   OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
 
 jobs:
@@ -129,6 +122,9 @@ jobs:
       matrix:
         cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
     runs-on: ${{ matrix.cuda_env }}
+    env:
+      CONDA_ENV: opencompass_regression
+      PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
     environment: 'prod'
     timeout-minutes: 240 #4hours
     steps:
@@ -209,6 +205,14 @@ jobs:
         cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
         regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
     runs-on: ${{ matrix.cuda_env }}
+    env:
+      CONDA_ENV: opencompass_regression
+      PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
+      HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+      HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+      HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+      COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
+      REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
     environment: 'prod'
     timeout-minutes: 240 #4hours
     steps:
@@ -305,9 +309,68 @@ jobs:
       run: |
         kill -15 "$restful_pid"
 
+  fullbench_run_test:
+    if: ${{!cancelled()}}
+    needs: ['build-pypi', 'build-pypi-lmdeploy']
+    env:
+      FULLBENCH_CONDA_ENV: regression_test
+      FULLBENCH_REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
+      COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
+    strategy:
+      fail-fast: false
+      matrix:
+        function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_long_context","base_objective","chat_long_context","chat_objective","chat_subjective"]')}}
+    runs-on: volc_cu12
+    environment: 'prod'
+    timeout-minutes: 360 #6hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Download Artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}
+      - name: Prepare - reinstall opencompass - cu12
+        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+        run: |
+          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
+          conda activate ${{env.FULLBENCH_CONDA_ENV}}
+          pip install opencompass*.whl --no-deps
+      - name: Prepare - reinstall lmdeploy - cu12
+        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}-py310
+      - name: Prepare - reinstall lmdeploy - cu12
+        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+        run: |
+          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
+          conda activate ${{env.FULLBENCH_CONDA_ENV}}
+          pip install lmdeploy-*.whl --no-deps
+      - name: Conda env
+        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+        run: |
+          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
+          conda activate ${{env.FULLBENCH_CONDA_ENV}}
+          conda info --envs
+          pip list
+      - name: Run command testcase
+        run: |
+          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
+          conda activate ${{env.FULLBENCH_CONDA_ENV}}
+          conda info --envs
+          export from_tf=TRUE
+          opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
+          rm regression_result_daily -f && ln -s ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
+          python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
+
 
   notify_to_feishu:
     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
-    needs: [daily_run_test]
+    needs: [daily_run_test, fullbench_run_test]
     environment: 'prod'
    timeout-minutes: 5
     runs-on: self-hosted
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
index 561ef750..bc829eab 100644
--- a/.github/workflows/pr-run-test.yml
+++ b/.github/workflows/pr-run-test.yml
@@ -29,7 +29,7 @@ env:
 
 jobs:
   pr_run_test:
-    runs-on: self-hosted
+    runs-on: dsw_cu12
     environment: 'prod'
     timeout-minutes: 30
     steps:
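
The final pytest step above is what turns the baseline YAMLs in this patch into pass/fail signals: pytest runs against the freshly generated summary linked at regression_result_daily and compares each metric with the matching entry in files like .github/scripts/oc_score_baseline_testrange.yaml. The real comparison logic lives in .github/scripts/oc_score_assert.py, which is not shown in this diff; the sketch below only illustrates the general shape of such a check. The THRESHOLD value and both function names are hypothetical, not the CI's actual tolerance or API.

    # Illustrative sketch only: the actual assertions live in
    # .github/scripts/oc_score_assert.py. THRESHOLD and the helper
    # names below are assumptions made for this example.
    import yaml

    THRESHOLD = 0.05  # assumed relative tolerance, not the CI's real value

    def load_baseline(path='.github/scripts/oc_score_baseline_testrange.yaml'):
        # Baseline shape after this patch: {model_name: {metric_key: score}},
        # e.g. baseline['glm-4-9b-chat-hf']['gsm8k_accuracy'] == 68.75.
        with open(path) as f:
            return yaml.safe_load(f)

    def assert_score(baseline, model, metric, actual):
        # Fail if a fresh score drifts beyond the tolerance from the baseline.
        expected = baseline[model][metric]
        if expected is None:
            # Some entries are intentionally blank (e.g. gemma-2b-vllm
            # race-high_accuracy above) and are skipped.
            return
        assert abs(actual - expected) <= abs(expected) * THRESHOLD, (
            f'{model}/{metric}: got {actual}, baseline {expected}')

A call such as assert_score(load_baseline(), 'glm-4-9b-chat-hf', 'gsm8k_accuracy', 68.75) is the shape of check the renamed metric keys (gsm8k -> gsm8k_accuracy, race-high -> race-high_accuracy, and so on) make unambiguous: each baseline key now names both the dataset and the metric being compared.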