From a9d6b6461ff15d11aaaaee02a8c3552c93321c59 Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Tue, 12 Nov 2024 18:40:27 +0800
Subject: [PATCH] [ci] react daily test (#1668)

* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* refactor summarize
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* Update daily-run-test.yml
* Update daily-run-test.yml
* update
* update
* update
* update
* update
* Update daily-run-test.yml
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* Update daily-run-test.yml
* Update daily-run-test.yml
* update
* update
* Update daily-run-test.yml
* update
* update
* update

---------

Co-authored-by: zhulin1
---
 .github/scripts/eval_regression_api.py        |  39 ++
 .github/scripts/eval_regression_base.py       |  58 ++-
 .../scripts/eval_regression_base_fullbench.py | 184 +++++++
 .github/scripts/eval_regression_chat.py       |  78 +--
 ...val_regression_chat_objective_fullbench.py | 246 ++++++++++
 ...al_regression_chat_subjective_fullbench.py |  70 +++
 .github/scripts/oc_score_assert.py            | 314 +++++++---
 .github/scripts/oc_score_baseline.yaml        | 389 ++-------
 .../scripts/oc_score_baseline_fullbench.yaml  | 153 ++++++
 .../scripts/oc_score_baseline_testrange.yaml  | 459 ++++++++++++++++++
 .github/workflows/daily-run-test.yml          | 241 +++++++--
 11 files changed, 1710 insertions(+), 521 deletions(-)
 create mode 100644 .github/scripts/eval_regression_api.py
 create mode 100644 .github/scripts/eval_regression_base_fullbench.py
 create mode 100644 .github/scripts/eval_regression_chat_objective_fullbench.py
 create mode 100644 .github/scripts/eval_regression_chat_subjective_fullbench.py
 create mode 100644 .github/scripts/oc_score_baseline_fullbench.yaml
 create mode 100644 .github/scripts/oc_score_baseline_testrange.yaml

diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py
new file mode 100644
index 00000000..db4f0ab2
--- /dev/null
+++ b/.github/scripts/eval_regression_api.py
@@ -0,0 +1,39 @@
+from mmengine.config import read_base
+
+from opencompass.models.openai_api import OpenAISDK
+
+with read_base():
+    # choose a list of datasets
+    from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
+        gsm8k_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.race.race_gen import \
+        race_datasets  # noqa: F401, E501
+
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+models = [
+    dict(
+        abbr='lmdeploy-api-test',
+        type=OpenAISDK,
+        key='EMPTY',
+        openai_api_base='http://localhost:23333/v1',
+        path='internlm2',
+        tokenizer_path='internlm/internlm2_5-7b-chat',
+        rpm_verbose=True,
+        meta_template=api_meta_template,
+        query_per_second=128,
+        max_out_len=1024,
+        max_seq_len=4096,
+        temperature=0.01,
+        batch_size=128,
+        retry=20,
+    )
+]
diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base.py
index 12339ecf..330c97e5 100644
--- a/.github/scripts/eval_regression_base.py
+++ b/.github/scripts/eval_regression_base.py
@@
-2,15 +2,21 @@ from mmengine.config import read_base with read_base(): # choose a list of datasets + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets # noqa: F401, E501 from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ gsm8k_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_ppl import \ race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + # read hf models - chat models + from opencompass.configs.models.chatglm.hf_glm4_9b import \ + models as hf_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ models as hf_deepseek_v2_lite_model # noqa: F401, E501 - # read hf models - chat models from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \ @@ -19,34 +25,58 @@ with read_base(): models as hf_gemma2_2b_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_9b import \ models as hf_gemma2_9b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_2b import \ + models as hf_gemma_2b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_7b import \ + models as hf_gemma_7b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.vllm_gemma_2b import \ + models as vllm_gemma_2b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.vllm_gemma_7b import \ + models as vllm_gemma_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \ models as hf_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ models as hf_internlm2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \ + models as hf_internlm2_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \ models as hf_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \ + models as hf_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \ models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \ models as lmdeploy_internlm2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \ + models as lmdeploy_internlm2_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama2_7b import \ models as hf_llama2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \ + models as hf_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b import \ models as hf_llama3_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b 
import \ models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ models as lmdeploy_llama3_8b_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \ + models as hf_mistral_7b_v0_2_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \ models as hf_mistral_7b_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \ models as vllm_mistral_7b_v0_2_model # noqa: F401, E501 - from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \ - models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \ + models as hf_qwen_2_5_7b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \ + models as hf_qwen_2_5_14b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \ + models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \ + models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \ models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_0_5b import \ @@ -65,11 +95,27 @@ with read_base(): models as hf_yi_1_5_6b_model # noqa: F401, E501 from opencompass.configs.models.yi.hf_yi_1_5_9b import \ models as hf_yi_1_5_9b_model # noqa: F401, E501 - from opencompass.configs.summarizers.medium import \ - summarizer # noqa: F401, E501 +race_datasets = [race_datasets[1]] models = sum([v for k, v in locals().items() if k.endswith('_model')], []) datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) for d in datasets: - d['reader_cfg']['test_range'] = '[0:100]' + d['reader_cfg']['test_range'] = '[0:32]' + +for m in models: + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +summarizer = dict( + dataset_abbrs=[ + ['gsm8k', 'accuracy'], + ['GPQA_diamond', 'accuracy'], + ['race-high', 'accuracy'], + ['winogrande', 'accuracy'], + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py new file mode 100644 index 00000000..d5ad48c4 --- /dev/null +++ b/.github/scripts/eval_regression_base_fullbench.py @@ -0,0 +1,184 @@ +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \ + ARC_c_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.drop.drop_gen_a2697c import \ + drop_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \ + gpqa_datasets # noqa: F401, E501 + # Corebench v1.7 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ + gsm8k_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \ + hellaswag_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \ + humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \ + humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \ + mmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_few_shot_ppl import \ + race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \ + BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \ + wikibench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \ + models as hf_internlm2_5_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ + models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501 + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + +race_datasets = [race_datasets[1]] # Only take RACE-High +humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2' +bbh_datasets = [ + x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr'] + or 'multistep_arithmetic_two' in x['abbr'] +] +cmmlu_datasets = [ + x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [ + 'ancient_chinese', 'chinese_civil_service_exam', + 'chinese_driving_rule', 'chinese_food_culture', + 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', + 'chinese_teacher_qualification', 'construction_project_management', + 'elementary_chinese', 'elementary_commonsense', 'ethnology', + 'high_school_politics', 'modern_chinese', + 'traditional_chinese_medicine' + ] +] +mmlu_datasets = [ + x for x in mmlu_datasets if 
x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', + 'global_facts', 'human_aging', 'management', 'marketing', + 'medical_genetics', 'miscellaneous', 'nutrition', + 'professional_accounting', 'professional_medicine', 'virology' + ] +] +mmlu_pro_datasets = [mmlu_pro_datasets[0]] +mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']] +GaokaoBench_datasets = [ + x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr'] + or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr'] +] +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +summarizer = dict( + dataset_abbrs=[ + ['race-high', 'accuracy'], + ['ARC-c', 'accuracy'], + ['BoolQ', 'accuracy'], + ['mmlu_pro', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['drop', 'accuracy'], + ['bbh', 'naive_average'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['openai_humaneval_v2', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + ['winogrande', 'accuracy'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', + '###### Overall: Average between MathBench-A and MathBench-T ######', + 'Overall', + '', + 'bbh-logical_deduction_seven_objects', + 'bbh-multistep_arithmetic_two', + '', + 'mmlu', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-humanities', + ['mmlu-other', 'accuracy'], + 'cmmlu', + 'cmmlu-stem', + 'cmmlu-social-science', + 'cmmlu-humanities', + 'cmmlu-other', + ['cmmlu-china-specific', 'accuracy'], + 'mmlu_pro', + 'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' + +for m in models: + m['abbr'] = m['abbr'] + '_fullbench' + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py index fa28562f..68c225c5 100644 --- a/.github/scripts/eval_regression_chat.py +++ b/.github/scripts/eval_regression_chat.py @@ -1,7 +1,5 @@ from mmengine.config import read_base -from opencompass.models import OpenAISDK - with read_base(): # choose a list of datasets from opencompass.configs.datasets.gsm8k.gsm8k_gen import \ @@ -29,6 +27,12 @@ with read_base(): models as hf_gemma2_2b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_9b_it 
import \ models as hf_gemma2_9b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_2b_it import \ + models as hf_gemma_2b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_7b_it import \ + models as hf_gemma_7b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \ + models as lmdeploy_gemma_9b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.vllm_gemma_7b_it import \ models as vllm_gemma_7b_it_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ @@ -51,18 +55,35 @@ with read_base(): models as vllm_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ models as hf_llama3_1_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \ + models as hf_llama3_2_3b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \ + models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \ + models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \ models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \ + models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ + models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \ + models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ + models as \ + lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \ + models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 - from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \ - models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 + from opencompass.configs.models.openbmb.hf_minicpm3_4b import \ + models as hf_minicpm3_4b_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \ models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \ @@ -73,6 +94,10 @@ with read_base(): models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501 from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \ models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \ + models as 
hf_qwen2_5_14b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \ + models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \ models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \ @@ -89,10 +114,8 @@ with read_base(): models as hf_yi_1_5_6b_chat_model # noqa: F401, E501 from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \ models as hf_yi_1_5_9b_chat_model # noqa: F401, E501 - from opencompass.configs.summarizers.medium import \ - summarizer # noqa: F401, E501 -models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +race_datasets = [race_datasets[1]] datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) api_meta_template = dict( @@ -103,25 +126,24 @@ api_meta_template = dict( reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) -model_name = '' - -models.append( - dict( - abbr='lmdeploy-api-test', - type=OpenAISDK, - key='EMPTY', - openai_api_base='http://judgemodel:10001/v1', - path='compass_judger_internlm2_102b_0508', - tokenizer_path='internlm/internlm2_5-20b-chat', - rpm_verbose=True, - meta_template=api_meta_template, - query_per_second=50, - max_out_len=1024, - max_seq_len=4096, - temperature=0.01, - batch_size=128, - retry=3, - )) - for d in datasets: - d['reader_cfg']['test_range'] = '[0:100]' + d['reader_cfg']['test_range'] = '[0:32]' + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +for m in models: + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +summarizer = dict( + dataset_abbrs=[ + 'gsm8k', + 'race-middle', + 'race-high', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_objective_fullbench.py new file mode 100644 index 00000000..ff8dfba4 --- /dev/null +++ b/.github/scripts/eval_regression_chat_objective_fullbench.py @@ -0,0 +1,246 @@ +from mmengine.config import read_base + +with read_base(): + # read hf models - chat models + # Dataset + from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \ + ARC_c_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \ + drop_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \ + ds1000_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets # noqa: F401, E501 + # new datasets in Fullbench v1.1 + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \ + humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \ + humanevalx_datasets # noqa: F401, E501 + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ + ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \ + mmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_cot_gen_d95929 import \ + race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \ + SciCode_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \ + BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \ + teval_datasets as teval_en_datasets # noqa: F401, E501 + from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \ + teval_datasets as teval_zh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \ + wikibench_datasets # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ + models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ + models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.ds1000 import \ + ds1000_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.humanevalx import \ + humanevalx_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.scicode import \ + scicode_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.teval import \ + teval_summary_groups # noqa: F401, E501 + +# For HumanEval-X Evaluation +# Apply the evaluator ip_address and port +race_datasets = [race_datasets[1]] +for item in humanevalx_datasets: 
+    item['eval_cfg']['evaluator'][
+        'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
+    item['eval_cfg']['evaluator']['port'] = ''
+
+# For DS-1000 Evaluation
+# Apply the evaluator ip_address and port
+for item in ds1000_datasets:
+    item['eval_cfg']['evaluator'][
+        'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
+    item['eval_cfg']['evaluator']['port'] = ''
+
+bbh_datasets = [
+    x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
+    or 'multistep_arithmetic_two' in x['abbr']
+]
+cmmlu_datasets = [
+    x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
+        'ancient_chinese', 'chinese_civil_service_exam',
+        'chinese_driving_rule', 'chinese_food_culture',
+        'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
+        'chinese_teacher_qualification', 'construction_project_management',
+        'elementary_chinese', 'elementary_commonsense', 'ethnology',
+        'high_school_politics', 'modern_chinese',
+        'traditional_chinese_medicine'
+    ]
+]
+mmlu_datasets = [
+    x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
+        'business_ethics', 'clinical_knowledge', 'college_medicine',
+        'global_facts', 'human_aging', 'management', 'marketing',
+        'medical_genetics', 'miscellaneous', 'nutrition',
+        'professional_accounting', 'professional_medicine', 'virology'
+    ]
+]
+
+mmlu_pro_datasets = [mmlu_pro_datasets[0]]
+mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
+GaokaoBench_datasets = [
+    x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
+    or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
+]
+
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')
+     and 'scicode' not in k.lower() and 'teval' not in k),
+    [],
+)
+datasets += teval_en_datasets
+datasets += teval_zh_datasets
+# datasets += SciCode_datasets
+
+summarizer = dict(
+    dataset_abbrs=[
+        ['race-high', 'accuracy'],
+        ['ARC-c', 'accuracy'],
+        ['BoolQ', 'accuracy'],
+        ['mmlu_pro', 'naive_average'],
+        ['drop', 'accuracy'],
+        ['bbh', 'naive_average'],
+        ['GPQA_diamond', 'accuracy'],
+        ['math', 'accuracy'],
+        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['sanitized_mbpp', 'score'],
+        ['cmmlu', 'naive_average'],
+        ['mmlu', 'naive_average'],
+        ['teval', 'naive_average'],
+        ['SciCode', 'accuracy'],
+        ['SciCode', 'sub_accuracy'],
+        ['humanevalx', 'naive_average'],
+        ['ds1000', 'naive_average'],
+        ['IFEval', 'Prompt-level-strict-accuracy'],
+        ['gsm8k', 'accuracy'],
+        ['GaokaoBench', 'weighted_average'],
+        ['triviaqa_wiki_1shot', 'score'],
+        ['nq_open_1shot', 'score'],
+        ['hellaswag', 'accuracy'],
+        ['TheoremQA', 'score'],
+        '###### MathBench-A: Application Part ######',
+        'college',
+        'high',
+        'middle',
+        'primary',
+        'arithmetic',
+        'mathbench-a (average)',
+        '###### MathBench-T: Theory Part ######',
+        'college_knowledge',
+        'high_knowledge',
+        'middle_knowledge',
+        'primary_knowledge',
+        'mathbench-t (average)',
+        '###### Overall: Average between MathBench-A and MathBench-T ######',
+        'Overall',
+        '',
+        'bbh-logical_deduction_seven_objects',
+        'bbh-multistep_arithmetic_two',
+        '',
+        'mmlu',
+        'mmlu-stem',
+        'mmlu-social-science',
+        'mmlu-humanities',
+        'mmlu-other',
+        '',
+        'cmmlu',
+        'cmmlu-stem',
+        'cmmlu-social-science',
+        'cmmlu-humanities',
+        'cmmlu-other',
+        'cmmlu-china-specific',
+        '',
+        'mmlu_pro',
+        'mmlu_pro_biology',
+        'mmlu_pro_business',
+        'mmlu_pro_chemistry',
+        'mmlu_pro_computer_science',
+        'mmlu_pro_economics',
+        'mmlu_pro_engineering',
+
'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + '', + 'GaokaoBench_2010-2022_Math_II_MCQs', + 'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank', + '', + 'humanevalx-python', + 'humanevalx-cpp', + 'humanevalx-go', + 'humanevalx-java', + 'humanevalx-js', + '', + 'ds1000_Pandas', + 'ds1000_Numpy', + 'ds1000_Tensorflow', + 'ds1000_Scipy', + 'ds1000_Sklearn', + 'ds1000_Pytorch', + 'ds1000_Matplotlib', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +for m in models: + m['abbr'] = m['abbr'] + '_fullbench' + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) diff --git a/.github/scripts/eval_regression_chat_subjective_fullbench.py b/.github/scripts/eval_regression_chat_subjective_fullbench.py new file mode 100644 index 00000000..8a6ef8fd --- /dev/null +++ b/.github/scripts/eval_regression_chat_subjective_fullbench.py @@ -0,0 +1,70 @@ +from copy import deepcopy + +from mmengine.config import read_base + +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.summarizers import SubjectiveSummarizer +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + +with read_base(): + # read hf models - chat models + # Dataset + from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \ + alignbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ + alpacav2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ + compassarena_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ + fofo_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \ + followbench_llmeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ + mtbench101_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ + wildbench_datasets # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ + models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ + models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + +summarizer = dict(type=SubjectiveSummarizer, function='subjective') + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') + and 'mtbench101' not in k and 'wildbench' not in k), []) +datasets += mtbench101_datasets # noqa: F401, E501 +datasets += wildbench_datasets # noqa: F401, E501 + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = sum([v for k, v in 
locals().items() if k.endswith('_model')], []) +for m in models: + m['abbr'] = m['abbr'] + '_fullbench' + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +judge_models = deepcopy([models[1]]) +judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge' + +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 2741a66a..d8e33adb 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -7,36 +7,56 @@ import yaml output_path = 'regression_result_daily' chat_model_list = [ - 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', - 'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', - 'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind', - 'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', + 'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind', + 'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', + 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf', + 'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind', + 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', + 'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy', 'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm', - 'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf', - 'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', - 'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm', - 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', + 'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf', + 'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind', + 'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', + 'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf', + 'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind', + 'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm', + 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf', 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', - 'lmdeploy-api-test' + 'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf', + 'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf', + 'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf', + 'qwen2.5-14b-instruct-turbomind' ] base_model_list = [ - 'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf', - 'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', - 'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf', - 'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind', - 'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind', - 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf', - 'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf', - 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', + 'glm-4-9b-hf', 
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', + 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf', + 'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm', + 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', + 'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind', + 'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', + 'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind', + 'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf', + 'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind', + 'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', 'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind', - 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' + 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf', + 'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf', + 'internlm2-20b-turbomind', 'qwen2.5-14b-hf' ] -dataset_list = ['gsm8k', 'race-middle', 'race-high'] + + +@pytest.fixture() +def baseline_scores_testrange(request): + config_path = os.path.join( + request.config.rootdir, + '.github/scripts/oc_score_baseline_testrange.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config @pytest.fixture() @@ -48,6 +68,16 @@ def baseline_scores(request): return config +@pytest.fixture() +def baseline_scores_fullbench(request): + config_path = os.path.join( + request.config.rootdir, + '.github/scripts/oc_score_baseline_fullbench.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config + + @pytest.fixture() def result_scores(): file = find_csv_files(output_path) @@ -57,100 +87,228 @@ def result_scores(): @pytest.mark.usefixtures('result_scores') -@pytest.mark.usefixtures('baseline_scores') +@pytest.mark.usefixtures('baseline_scores_testrange') @pytest.mark.chat class TestChat: """Test cases for chat model.""" - @pytest.mark.parametrize('model, dataset', [(p1, p2) - for p1 in chat_model_list - for p2 in dataset_list]) - def test_model_dataset_score(self, baseline_scores, result_scores, model, - dataset): - base_score = baseline_scores.get(model).get(dataset) + @pytest.mark.parametrize('model, dataset', + [(p1, p2) for p1 in chat_model_list + for p2 in ['gsm8k', 'race-high']]) + def test_model_dataset_score(self, baseline_scores_testrange, + result_scores, model, dataset): + base_score = baseline_scores_testrange.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(result_score, base_score) + assert_score(model, result_score, base_score) @pytest.mark.usefixtures('result_scores') -@pytest.mark.usefixtures('baseline_scores') +@pytest.mark.usefixtures('baseline_scores_testrange') @pytest.mark.base class TestBase: """Test cases for base model.""" - @pytest.mark.parametrize('model, dataset', [(p1, p2) - for p1 in base_model_list - for p2 in dataset_list]) - def test_model_dataset_score(self, baseline_scores, result_scores, model, - dataset): - if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high': + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in base_model_list + for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']]) + def test_model_dataset_score(self, baseline_scores_testrange, + result_scores, model, dataset): + if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k': return - base_score = baseline_scores.get(model).get(dataset) + base_score = 
baseline_scores_testrange.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(result_score, base_score)
+        assert_score(model, result_score, base_score)
 
 
 @pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.chat_obj_fullbench
+class TestChatObjFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-hf_fullbench',
+        'internlm2_5-7b-chat-turbomind_fullbench'
+    ] for p2 in [
+        'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
+        'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000',
+        'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag',
+        'TheoremQA', 'college', 'college_knowledge',
+        'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
+        'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas',
+        'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn',
+        'ds1000_Pytorch', 'ds1000_Matplotlib'
+    ]])
+    def test_model_dataset_score(self, baseline_scores_fullbench,
+                                 result_scores, model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.chat_sub_fullbench
+class TestChatSubFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-hf_fullbench',
+        'internlm2_5-7b-chat-turbomind_fullbench'
+    ] for p2 in [
+        'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal',
+        'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language',
+        'CompassArenacompassarena_knowledge',
+        'CompassArenacompassarena_reason_v2',
+        'CompassArenacompassarena_math_v2',
+        'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts',
+        'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1',
+        'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4',
+        'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2',
+        'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5',
+        'MTBench101average', 'Wildbenchscore'
+    ]])
+    def test_model_dataset_score(self, baseline_scores_fullbench,
+                                 result_scores, model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.base_fullbench
+class TestBaseFullbench:
+    """Test cases for base model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
+    ] for p2 in [
+        'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math',
+        'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k',
+        'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag',
+        'TheoremQA', 'college', 'college_knowledge',
+        'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two',
+        'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math'
+    ]])
+    def test_model_dataset_score(self, baseline_scores_fullbench,
+                                 result_scores, model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.api
+class TestApibench:
+    """Test cases for api model."""
+
+    @pytest.mark.parametrize('model, dataset',
+                             [('lmdeploy-api-test', 'race-middle'),
+                              ('lmdeploy-api-test', 'race-high'),
+                              ('lmdeploy-api-test', 'gsm8k')])
+    def test_api(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
 class TestCmdCase:
 
     @pytest.mark.case1
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b-hf', 'race-middle'),
-                              ('internlm2_5-7b-hf', 'race-high')])
-    def test_cmd_case1(self, result_scores, model, dataset):
-        if len(result_scores.keys()) != 1:
-            assert False, 'result is none'
+                              ('internlm2_5-7b-hf', 'race-high'),
+                              ('internlm2_5-7b-hf', 'demo_gsm8k'),
+                              ('internlm2-1.8b-hf', 'race-middle'),
+                              ('internlm2-1.8b-hf', 'race-high'),
+                              ('internlm2-1.8b-hf', 'demo_gsm8k')])
+    def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(result_score, 91)
+        assert_score(model, result_score, base_score)
 
     @pytest.mark.case2
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b-chat-lmdeploy', 'race-middle'),
-                              ('internlm2_5-7b-chat-lmdeploy', 'race-high')])
-    def test_cmd_case2(self, result_scores, model, dataset):
-        if len(result_scores.keys()) != 1:
-            assert False, 'result is none'
+                              ('internlm2_5-7b-chat-lmdeploy', 'race-high'),
+                              ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'),
+                              ('internlm2-chat-1.8b-lmdeploy', 'race-middle'),
+                              ('internlm2-chat-1.8b-lmdeploy', 'race-high'),
+                              ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')])
+    def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(result_score, 91)
+        assert_score(model + '_batch', result_score, base_score)
 
     @pytest.mark.case3
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b_hf', 'race-middle'),
-                              ('internlm2_5-7b_hf', 'race-high')])
-    def test_cmd_case3(self, result_scores, model, dataset):
-        if len(result_scores.keys()) != 1:
-            assert False, 'result is none'
+                              ('internlm2_5-7b_hf', 'race-high'),
+                              ('internlm2_5-7b_hf', 'demo_gsm8k')])
+    def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(result_score, 91)
+        assert_score(model, result_score, base_score)
 
     @pytest.mark.case4
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b-chat_hf', 'race-middle'),
-                              ('internlm2_5-7b-chat_hf', 'race-high')])
-    def test_cmd_case4(self, result_scores, model, dataset):
-        if len(result_scores.keys()) != 1:
-            assert False, 'result is none'
+                              ('internlm2_5-7b-chat_hf', 'race-high'),
+                              ('internlm2_5-7b-chat_hf', 'demo_gsm8k')])
+    def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(result_score, 91)
+        assert_score(model, result_score, base_score)
 
 
-def assert_score(score, baseline):
+THRESHOLD = 3
+
+
+def assert_score(model_type, score, baseline):
     if score is None or score == '-':
         assert False, 'value is none'
-    if float(score) <= (baseline + 5) and float(score) >= (baseline - 5):
-        print(score + ' between ' + str(baseline - 5) + ' and ' +
-              str(baseline + 5))
-        assert True
+
+    if 'batch' not in model_type:
+        if float(score) <= (baseline + 0.01) and float(score) >= (baseline -
+                                                                  0.01):
+            print(' '.join([score, 'is equal', str(baseline)]))
+            assert True
+        else:
+            print(' '.join([score, 'is not equal', str(baseline)]))
+            assert False, ' '.join([score, 'is not equal', str(baseline)])
     else:
-        assert False, score + ' not between ' + str(
-            baseline - 5) + ' and ' + str(baseline + 5)
+        if float(score) <= (baseline + THRESHOLD) and float(score) >= (
+                baseline - THRESHOLD):
+            print(' '.join([
+                score, 'is between',
+                str(baseline - THRESHOLD), 'and',
+                str(baseline + THRESHOLD)
+            ]))
+            assert True
+        else:
+            print(' '.join([
+                score, 'is not between',
+                str(baseline - THRESHOLD), 'and',
+                str(baseline + THRESHOLD)
+            ]))
+            assert False, ' '.join([
+                score, 'is not between',
+                str(baseline - THRESHOLD), 'and',
+                str(baseline + THRESHOLD)
+            ])
 
 
 def find_csv_files(directory):
     csv_files = []
     for root, dirs, files in os.walk(directory):
         for file in files:
-            if file.endswith('.csv'):
+            if file.endswith('.csv') and (file.startswith('summary') or
+                                          file.startswith('Subjective_all')):
                 csv_files.append(os.path.join(root, file))
 
     csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
@@ -163,14 +321,24 @@ def read_csv_file(file_path):
     with open(file_path, 'r') as csvfile:
         reader = csv.DictReader(csvfile)
         filtered_data = []
-
-        for row in reader:
-            filtered_row = {
-                k: v
-                for k, v in row.items()
-                if k not in ['version', 'metric', 'mode']
-            }
-            filtered_data.append(filtered_row)
+        if 'Subjective_all' not in file_path:
+            for row in reader:
+                if row['metric'] is not None and 'bpb' not in row['metric']:
+                    filtered_row = {
+                        k: v
+                        for k, v in row.items()
+                        if k not in ['version', 'metric', 'mode']
+                    }
+                    filtered_data.append(filtered_row)
+        else:
+            for row in reader:
+                if row['Detailed Scores'] is not None:
+                    filtered_row = row
+                    filtered_row['dataset'] = filtered_row[
+                        'Dataset'] + filtered_row['Detailed Scores']
+                    del filtered_row['Dataset']
+                    del filtered_row['Detailed Scores']
+                    filtered_data.append(filtered_row)
 
     result = {}
     for data in filtered_data:
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
index 4c3b38f6..40cb1087 100644
--- a/.github/scripts/oc_score_baseline.yaml
+++ b/.github/scripts/oc_score_baseline.yaml
@@ -1,369 +1,34 @@
-baichuan2-7b-chat-hf:
-  gsm8k: 30
-  race-middle: 74
-  race-high: 79
+internlm2_5-7b-hf:
+  demo_gsm8k: 42.19
+  race-middle: 91.78
+  race-high: 90.02
 
-glm-4-9b-chat-hf:
-  gsm8k: 75
-  race-middle: 88
-  race-high: 88
+internlm2_5-7b_hf:
+  demo_gsm8k: 42.19
+  race-middle: 91.78
+  race-high: 90.02
 
-glm-4-9b-chat-turbomind:
-  gsm8k: 69
-  race-middle: 82
-  race-high: 77
+internlm2-1.8b-hf:
+  demo_gsm8k: 15.62
+  race-middle: 71.66
+  race-high: 66.38
 
-glm-4-9b-chat-vllm:
-  gsm8k: 73
-  race-middle: 87
-  race-high: 87
+internlm2_5-7b-chat-lmdeploy:
+  demo_gsm8k: 84.38
+  race-middle: 92.76
+  race-high: 90.54
 
-deepseek-7b-chat-hf:
-  gsm8k: 60
-  race-middle: 74
-  race-high: 80
+internlm2-chat-1.8b-lmdeploy:
+  demo_gsm8k: 31
+  race-middle: 81.34
+  race-high: 73.96
 
-deepseek-moe-16b-chat-hf:
-  gsm8k: 62
-  race-middle: 62
-  race-high: 70
-
-deepseek-v2-lite-chat-hf:
-  gsm8k: 59
-  race-middle: 82
-  race-high: 79
-
-deepseek-7b-chat-vllm: - gsm8k: 63 - race-middle: 74 - race-high: 79 - -gemma-2b-it-hf: - gsm8k: 14 - race-middle: 62 - race-high: 52 - -gemma-7b-it-hf: - gsm8k: 39 - race-middle: 74 - race-high: 71 - -gemma-7b-it-vllm: - gsm8k: 38 - race-middle: 75 - race-high: 70 - -gemma2-2b-it-hf: - gsm8k: 62 - race-middle: 75 - race-high: 67 - -gemma2-9b-it-hf: - gsm8k: 80 - race-middle: 89 - race-high: 85 - -internlm2_5-7b-chat-hf: - gsm8k: 86 - race-middle: 92 - race-high: 93 - -internlm2_5-20b-chat-hf: - gsm8k: 91 - race-middle: 95 - race-high: 91 - -internlm2_5-7b-chat-turbomind: - gsm8k: 87 - race-middle: 92 - race-high: 93 - -internlm2_5-20b-chat-turbomind: - gsm8k: 91 - race-middle: 95 - race-high: 91 - -internlm2-chat-1.8b-turbomind: - gsm8k: 40 - race-middle: 82 - race-high: 83 - -internlm2-chat-1.8b-sft-turbomind: - gsm8k: 34 - race-middle: 81 - race-high: 83 - -internlm2-chat-7b-lmdeploy: - gsm8k: 69 - race-middle: 90 - race-high: 88 - -internlm2-chat-7b-sft-turbomind: - gsm8k: 71 - race-middle: 91 - race-high: 92 - -internlm2-chat-7b-vllm: - gsm8k: 63 - race-middle: 90 - race-high: 91 - -llama-3_1-8b-instruct-hf: - gsm8k: 82 - race-middle: 82 - race-high: 88 - -llama-3-8b-instruct-hf: - gsm8k: 77 - race-middle: 85 - race-high: 87 - -llama-3_1-8b-instruct-turbomind: - gsm8k: 79 - race-middle: 82 - race-high: 88 - -llama-3-8b-instruct-turbomind: - gsm8k: 77 - race-middle: 85 - race-high: 89 - -mistral-7b-instruct-v0.2-hf: - gsm8k: 48 - race-middle: 82 - race-high: 78 - -mistral-7b-instruct-v0.3-hf: - gsm8k: 53 - race-middle: 80 - race-high: 78 - -mistral-7b-instruct-v0.2-vllm: - gsm8k: 49 - race-middle: 81 - race-high: 77 - -minicpm-2b-dpo-fp32-hf: - gsm8k: 58 - race-middle: 66 - race-high: 74 - -minicpm-2b-sft-bf16-hf: - gsm8k: 58 - race-middle: 75 - race-high: 81 - -minicpm-2b-sft-fp32-hf: - gsm8k: 58 - race-middle: 75 - race-high: 81 - -phi-3-mini-4k-instruct-hf: - gsm8k: 67 - race-middle: 81 - race-high: 84 - -phi-3-small-8k-instruct-hf: - gsm8k: 88 - race-middle: 89 - race-high: 88 - -qwen1.5-0.5b-chat-hf: - gsm8k: 5 - race-middle: 55 - race-high: 50 - -qwen2-1.5b-instruct-hf: - gsm8k: 63 - race-middle: 77 - race-high: 86 - -qwen2-1.5b-instruct-turbomind: - gsm8k: 60 - race-middle: 77 - race-high: 86 - -qwen2-7b-instruct-turbomind: - gsm8k: 88 - race-middle: 87 - race-high: 89 - -qwen2-7b-instruct-hf: - gsm8k: 85 - race-middle: 87 - race-high: 91 - -qwen1.5-0.5b-chat-vllm: - gsm8k: 5 - race-middle: 57 - race-high: 51 - -yi-1.5-6b-chat-hf: - gsm8k: 72 - race-middle: 88 - race-high: 86 - -yi-1.5-9b-chat-hf: - gsm8k: 81 - race-middle: 89 - race-high: 91 +internlm2_5-7b-chat_hf: + demo_gsm8k: 87.50 + race-middle: 92.76 + race-high: 90.48 lmdeploy-api-test: - gsm8k: 90 - race-middle: 95 - race-high: 96 - -deepseek-moe-16b-base-hf: - gsm8k: 25 - race-middle: 35 - race-high: 23 - -deepseek-v2-lite-hf: - gsm8k: 37 - race-middle: 56 - race-high: 62 - -deepseek-7b-base-turbomind: - gsm8k: 21 - race-middle: 42 - race-high: 42 - -deepseek-moe-16b-base-vllm: - gsm8k: 22 - race-middle: 35 - race-high: 20 - -gemma-2b-hf: - gsm8k: 19 - race-middle: 33 - race-high: 26 - -gemma-7b-hf: - gsm8k: 65 - race-middle: 59 - race-high: 66 - -gemma2-2b-hf: - gsm8k: 33 - race-middle: 56 - race-high: 58 - -gemma2-9b-hf: - gsm8k: 70 - race-middle: 82 - race-high: 84 - -internlm2_5-7b-hf: - gsm8k: 47 - race-middle: 92 - race-high: 91 - -internlm2-7b-hf: - gsm8k: 65 - race-middle: 77 - race-high: 72 - -internlm2-base-7b-hf: - gsm8k: 5 - race-middle: 71 - race-high: 74 - -internlm2_5-7b-turbomind: - gsm8k: 73 - 
race-middle: 90 - race-high: 91 - -internlm2-1.8b-turbomind: - gsm8k: 25 - race-middle: 75 - race-high: 72 - -internlm2-7b-turbomind: - gsm8k: 67 - race-middle: 78 - race-high: 76 - -internlm2-base-7b-turbomind: - gsm8k: 39 - race-middle: 75 - race-high: 81 - -llama-2-7b-hf: - gsm8k: 17 - race-middle: 32 - race-high: 38 - -llama-3-8b-hf: - gsm8k: 48 - race-middle: 64 - race-high: 70 - -llama-3.1-8b-turbomind: - gsm8k: 57 - race-middle: 67 - race-high: 75 - -llama-3-8b-turbomind: - gsm8k: 52 - race-middle: 63 - race-high: 70 - -mistral-7b-v0.2-hf: - gsm8k: 43 - race-middle: 42 - race-high: 60 - -mistral-7b-v0.3-hf: - gsm8k: 43 - race-middle: 42 - race-high: 60 - -mistral-7b-v0.2-vllm: - gsm8k: 45 - race-middle: 42 - race-high: 58 - -qwen1.5-moe-a2.7b-hf: - gsm8k: 64 - race-middle: 78 - race-high: 90 - -qwen2-1.5b-hf: - gsm8k: 58 - race-middle: 65 - race-high: 78 - -qwen2-0.5b-hf: - gsm8k: 35 - race-middle: 52 - race-high: 48 - -qwen2-7b-hf: - gsm8k: 82 - race-middle: 88 - race-high: 89 - -qwen2-1.5b-turbomind: - gsm8k: 57 - race-middle: 64 - race-high: 78 - -qwen2-7b-turbomind: - gsm8k: 83 - race-middle: 88 - race-high: 88 - -qwen1.5-0.5b-vllm: - gsm8k: 12 - race-middle: 54 - race-high: 59 - -yi-1.5-6b-hf: - gsm8k: 59 - race-middle: 81 - race-high: 89 - -yi-1.5-9b-hf: - gsm8k: 77 - race-middle: 90 - race-high: 90 + gsm8k: 83.78 + race-middle: 92.41 + race-high: 90.37 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml new file mode 100644 index 00000000..4eea62fe --- /dev/null +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -0,0 +1,153 @@ +internlm2_5-7b-chat-hf_fullbench: + race-high: 93.75 + ARC-c: 87.5 + BoolQ: 81.25 + drop: 81.25 + GPQA_diamond: 25 + math: 75 + wikibench-wiki-single_choice_cncircular: 50 + sanitized_mbpp: 68.75 + ds1000: 16.96 + gsm8k: 56.25 + triviaqa_wiki_1shot: 50 + nq_open_1shot: 25 + hellaswag: 87.5 + TheoremQA: 18.75 + college: 12.5 + college_knowledge: 87.5 + bbh-logical_deduction_seven_objects: 50 + bbh-multistep_arithmetic_two: 68.75 + mmlu-other: 72.6 + cmmlu-china-specific: 76.25 + mmlu_pro_math: 25 + ds1000_Pandas: 12.5 + ds1000_Numpy: 0 + ds1000_Tensorflow: 12.5 + ds1000_Scipy: 18.75 + ds1000_Sklearn: 18.75 + ds1000_Pytorch: 12.5 + ds1000_Matplotlib: 43.75 + Alignbench总分: 0.65 + Alignbench专业能力: 7.83 + AlpacaEvaltotal: 0 + AlpacaEvalhelpful_base: 0 + CompassArenacompassarena_language: 60 + CompassArenacompassarena_knowledge: 56 + CompassArenacompassarena_reason_v2: 50 + CompassArenacompassarena_math_v2: 53.5 + CompassArenacompassarena_creationv2_zh: 48.75 + Fofofofo_test_prompts: 1 + followbenchHSR_AVG: 1 + followbenchSSR_AVG: 1 + followbenchHSR_L1: 1 + followbenchHSR_L2: 1 + followbenchHSR_L3: 1 + followbenchHSR_L4: 1 + followbenchHSR_L5: 1 + followbenchSSR_L1: 1 + followbenchSSR_L2: 1 + followbenchSSR_L3: 1 + followbenchSSR_L4: 1 + followbenchSSR_L5: 1 + MTBench101average: 8.1 + Wildbenchscore: -3.3333333333333335 + +internlm2_5-7b-chat-turbomind_fullbench: + race-high: 93.75 + ARC-c: 87.5 + BoolQ: 68.75 + drop: 75 + GPQA_diamond: 25 + math: 75 + wikibench-wiki-single_choice_cncircular: 25 + sanitized_mbpp: 68.75 + ds1000: 13.39 + gsm8k: 68.75 + triviaqa_wiki_1shot: 50 + nq_open_1shot: 25 + hellaswag: 81.25 + TheoremQA: 6.25 + college: 0 + college_knowledge: 87.5 + bbh-logical_deduction_seven_objects: 56.25 + bbh-multistep_arithmetic_two: 68.75 + mmlu-other: 74.04 + cmmlu-china-specific: 76.25 + mmlu_pro_math: 25 + ds1000_Pandas: 0 + ds1000_Numpy: 0 + ds1000_Tensorflow: 12.5 + 
+  ds1000_Scipy: 18.75
+  ds1000_Sklearn: 18.75
+  ds1000_Pytorch: 6.25
+  ds1000_Matplotlib: 37.5
+  Alignbench总分: 0.64
+  Alignbench专业能力: 7.6
+  AlpacaEvaltotal: 10
+  AlpacaEvalhelpful_base: 10
+  CompassArenacompassarena_language: 59
+  CompassArenacompassarena_knowledge: 57
+  CompassArenacompassarena_reason_v2: 49.5
+  CompassArenacompassarena_math_v2: 51
+  CompassArenacompassarena_creationv2_zh: 43.75
+  Fofofofo_test_prompts: 1
+  followbenchHSR_AVG: 1
+  followbenchSSR_AVG: 1
+  followbenchHSR_L1: 1
+  followbenchHSR_L2: 1
+  followbenchHSR_L3: 1
+  followbenchHSR_L4: 1
+  followbenchHSR_L5: 1
+  followbenchSSR_L1: 1
+  followbenchSSR_L2: 1
+  followbenchSSR_L3: 1
+  followbenchSSR_L4: 1
+  followbenchSSR_L5: 1
+  MTBench101average: 8.1
+  Wildbenchscore: -8.333333333333334
+
+internlm2_5-7b-hf_fullbench:
+  race-high: 100
+  ARC-c: 68.75
+  BoolQ: 87.5
+  GPQA_diamond: 62.5
+  drop: 62.5
+  math: 12.5
+  wikibench-wiki-single_choice_cncircular: 25
+  sanitized_mbpp: 56.25
+  gsm8k: 37.5
+  triviaqa_wiki_1shot: 43.75
+  nq_open_1shot: 43.75
+  winogrande: 75
+  hellaswag: 93.75
+  TheoremQA: 25
+  college: 12.5
+  college_knowledge: 87.5
+  bbh-logical_deduction_seven_objects: 43.75
+  bbh-multistep_arithmetic_two: 56.25
+  mmlu-other: 76.92
+  cmmlu-china-specific: 84.17
+  mmlu_pro_math: 18.75
+
+internlm2_5-7b-turbomind_fullbench:
+  race-high: 100
+  ARC-c: 68.75
+  BoolQ: 87.5
+  GPQA_diamond: 62.5
+  drop: 62.5
+  math: 18.75
+  wikibench-wiki-single_choice_cncircular: 25
+  sanitized_mbpp: 56.25
+  gsm8k: 68.75
+  triviaqa_wiki_1shot: 43.75
+  nq_open_1shot: 43.75
+  winogrande: 87.5
+  hellaswag: 93.75
+  TheoremQA: 31.25
+  college: 12.5
+  college_knowledge: 87.5
+  bbh-logical_deduction_seven_objects: 50
+  bbh-multistep_arithmetic_two: 56.25
+  mmlu-other: 76.92
+  cmmlu-china-specific: 84.17
+  mmlu_pro_math: 18.75
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
new file mode 100644
index 00000000..6df2b515
--- /dev/null
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -0,0 +1,459 @@
+baichuan2-7b-chat-hf:
+  gsm8k: 18.75
+  race-high: 78.12
+
+glm-4-9b-chat-hf:
+  gsm8k: 68.75
+  race-high: 90.62
+
+glm-4-9b-chat-turbomind:
+  gsm8k: 75.00
+  race-high: 90.62
+
+glm-4-9b-chat-vllm:
+  gsm8k: 65.62
+  race-high: 90.62
+
+deepseek-7b-chat-hf:
+  gsm8k: 46.88
+  race-high: 81.25
+
+deepseek-moe-16b-chat-hf:
+  gsm8k: 50
+  race-high: 68.75
+
+deepseek-7b-chat-vllm:
+  gsm8k: 43.75
+  race-high: 75
+
+gemma2-2b-it-hf:
+  gsm8k: 50
+  race-high: 71.88
+
+gemma2-9b-it-hf:
+  gsm8k: 71.88
+  race-high: 84.38
+
+gemma-2b-it-hf:
+  gsm8k: 3.12
+  race-high: 40.62
+
+gemma-7b-it-hf:
+  gsm8k: 40.62
+  race-high: 68.75
+
+gemma-2-9b-it-turbomind:
+  gsm8k: 68.75
+  race-high: 81.25
+
+gemma-7b-it-vllm:
+  gsm8k: 28.12
+  race-high: 68.75
+
+internlm2_5-7b-chat-hf:
+  gsm8k: 84.38
+  race-high: 90.62
+
+internlm2_5-7b-chat-turbomind:
+  gsm8k: 84.38
+  race-high: 90.62
+
+internlm2-chat-1.8b-turbomind:
+  gsm8k: 25
+  race-high: 84.38
+
+internlm2-chat-1.8b-sft-turbomind:
+  gsm8k: 21.88
+  race-high: 84.38
+
+internlm2-chat-7b-lmdeploy:
+  gsm8k: 53.12
+  race-high: 84.38
+
+internlm2-chat-7b-sft-turbomind:
+  gsm8k: 50
+  race-high: 90.62
+
+internlm2-chat-7b-vllm:
+  gsm8k: 43.75
+  race-high: 87.5
+
+llama-3_1-8b-instruct-hf:
+  gsm8k: 84.38
+  race-high: 90.62
+
+llama-3_2-3b-instruct-hf:
+  gsm8k: 65.62
+  race-high: 81.25
+
+llama-3-8b-instruct-hf:
+  gsm8k: 68.75
+  race-high: 87.5
+
+llama-3_1-8b-instruct-turbomind:
+  gsm8k: 78.12
+  race-high: 90.62
+
+llama-3_2-3b-instruct-turbomind:
+  gsm8k: 65.62
+  race-high: 81.25
+
+llama-3-8b-instruct-turbomind:
+  gsm8k: 68.75
+  race-high: 87.5
+
+mistral-7b-instruct-v0.2-hf:
+  gsm8k: 40.62
+  race-high: 75
+
+mistral-7b-instruct-v0.3-hf:
+  gsm8k: 40.62
+  race-high: 75
+
+mistral-nemo-instruct-2407-hf:
+  gsm8k: 75
+  race-high: 81.25
+
+mistral-nemo-instruct-2407-turbomind:
+  gsm8k: 75
+  race-high: 81.25
+
+mistral-7b-instruct-v0.1-vllm:
+  gsm8k: 37.5
+  race-high: 71.88
+
+mistral-7b-instruct-v0.2-vllm:
+  gsm8k: 43.75
+  race-high: 75
+
+MiniCPM3-4B-hf:
+  gsm8k: 68.75
+  race-high: 84.38
+
+minicpm-2b-dpo-fp32-hf:
+  gsm8k: 56.25
+  race-high: 56.25
+
+minicpm-2b-sft-bf16-hf:
+  gsm8k: 46.88
+  race-high: 65.62
+
+minicpm-2b-sft-fp32-hf:
+  gsm8k: 46.88
+  race-high: 65.62
+
+phi-3-mini-4k-instruct-hf:
+  gsm8k: 56.25
+  race-high: 78.12
+
+qwen1.5-0.5b-chat-hf:
+  gsm8k: 0
+  race-high: 53.12
+
+qwen2-1.5b-instruct-hf:
+  gsm8k: 62.5
+  race-high: 84.38
+
+qwen2-7b-instruct-hf:
+  gsm8k: 68.75
+  race-high: 90.62
+
+qwen2-1.5b-instruct-turbomind:
+  gsm8k: 62.50
+  race-high: 84.38
+
+qwen2-7b-instruct-turbomind:
+  gsm8k: 81.25
+  race-high: 87.5
+
+qwen1.5-0.5b-chat-vllm:
+  gsm8k: 3.12
+  race-high: 53.12
+
+yi-1.5-6b-chat-hf:
+  gsm8k: 65.62
+  race-high: 84.38
+
+yi-1.5-9b-chat-hf:
+  gsm8k: 75
+  race-high: 93.75
+
+deepseek-v2-lite-chat-hf:
+  gsm8k: 43.75
+  race-high: 71.88
+
+internlm2_5-20b-chat-hf:
+  gsm8k: 84.38
+  race-high: 87.5
+
+internlm2_5-20b-chat-turbomind:
+  gsm8k: 84.38
+  race-high: 87.5
+
+mistral-small-instruct-2409-hf:
+  gsm8k: 81.25
+  race-high: 90.62
+
+mistral-small-instruct-2409-turbomind:
+  gsm8k: 78.12
+  race-high: 90.62
+
+qwen2.5-14b-instruct-hf:
+  gsm8k: 71.88
+  race-high: 93.75
+
+qwen2.5-14b-instruct-turbomind:
+  gsm8k: 71.88
+  race-high: 93.75
+
+glm-4-9b-hf:
+  gsm8k: 68.75
+  GPQA_diamond: 31.25
+  race-high: 93.75
+  winogrande: 84.38
+
+deepseek-moe-16b-base-hf:
+  gsm8k: 21.88
+  GPQA_diamond: 0
+  race-high: 21.88
+  winogrande: 65.62
+
+deepseek-7b-base-turbomind:
+  gsm8k: 21.88
+  GPQA_diamond: 0
+  race-high: 46.88
+  winogrande: 84.38
+
+deepseek-moe-16b-base-vllm:
+  gsm8k: 21.88
+  GPQA_diamond: 0
+  race-high: 25
+  winogrande: 68.75
+
+gemma2-2b-hf:
+  gsm8k: 31.25
+  GPQA_diamond: 3.12
+  race-high: 56.25
+  winogrande: 71.88
+
+gemma2-9b-hf:
+  gsm8k: 68.75
+  GPQA_diamond: 0
+  race-high: 81.25
+  winogrande: 84.38
+
+gemma-2b-hf:
+  gsm8k: 18.75
+  GPQA_diamond: 3.12
+  race-high: 25
+  winogrande: 53.12
+
+gemma-7b-hf:
+  gsm8k: 56.25
+  GPQA_diamond: 6.25
+  race-high: 65.62
+  winogrande: 78.12
+
+gemma-2b-vllm:
+  gsm8k: 18.75
+  GPQA_diamond: 6.25
+  race-high:
+  winogrande:
+
+gemma-7b-vllm:
+  gsm8k: 59.38
+  GPQA_diamond: 6.25
+  race-high:
+  winogrande:
+
+internlm2_5-7b-hf:
+  gsm8k: 37.5
+  GPQA_diamond: 25
+  race-high: 93.75
+  winogrande: 71.88
+
+internlm2-7b-hf:
+  gsm8k: 53.12
+  GPQA_diamond: 18.75
+  race-high: 62.5
+  winogrande: 78.12
+
+internlm2-base-7b-hf:
+  gsm8k: 3.12
+  GPQA_diamond: 21.88
+  race-high: 75
+  winogrande: 65.62
+
+internlm2-1.8b-turbomind:
+  gsm8k: 12.5
+  GPQA_diamond: 12.5
+  race-high: 71.88
+  winogrande: 75
+
+internlm2_5-7b-turbomind:
+  gsm8k: 68.75
+  GPQA_diamond: 31.25
+  race-high: 93.75
+  winogrande: 84.38
+
+internlm2-7b-turbomind:
+  gsm8k: 56.25
+  GPQA_diamond: 21.88
+  race-high: 75
+  winogrande: 81.25
+
+internlm2-base-7b-turbomind:
+  gsm8k: 40.62
+  GPQA_diamond: 28.12
+  race-high: 84.38
+  winogrande: 71.88
+
+llama-2-7b-hf:
+  gsm8k: 21.88
+  GPQA_diamond: 21.88
+  race-high: 40.62
+  winogrande: 71.88
+
+llama-3_1-8b-hf:
+  gsm8k: 78.12
+  GPQA_diamond: 25
+  race-high: 90.62
+  winogrande: 62.5
+
+llama-3-8b-hf:
+  gsm8k: 46.88
+  GPQA_diamond: 6.25
+  race-high: 65.62
+  winogrande: 65.62
+
+llama-3.1-8b-turbomind:
+  gsm8k: 56.25
+  GPQA_diamond: 6.25
+  race-high: 78.12
+  winogrande: 78.12
+
+llama-3-8b-turbomind:
+  gsm8k: 50
+  GPQA_diamond: 9.38
+  race-high: 65.62
+  winogrande: 78.12
+
+mistral-7b-v0.2-hf:
+  gsm8k: 31.25
+  GPQA_diamond: 6.25
+  race-high: 62.5
+  winogrande: 59.38
+
+mistral-7b-v0.3-hf:
+  gsm8k: 31.25
+  GPQA_diamond: 6.25
+  race-high: 62.5
+  winogrande: 59.38
+
+mistral-7b-v0.2-vllm:
+  gsm8k: 34.38
+  GPQA_diamond: 6.25
+  race-high: 62.5
+  winogrande: 65.62
+
+qwen2.5-7b-hf:
+  gsm8k: 81.25
+  GPQA_diamond: 18.75
+  race-high: 87.5
+  winogrande: 71.88
+
+qwen2.5-1.5b-turbomind:
+  gsm8k: 71.88
+  GPQA_diamond: 15.62
+  race-high: 78.12
+  winogrande: 71.88
+
+qwen2.5-7b-turbomind:
+  gsm8k: 71.88
+  GPQA_diamond: 25
+  race-high: 87.5
+  winogrande: 71.88
+
+qwen1.5-moe-a2.7b-hf:
+  gsm8k: 62.5
+  GPQA_diamond: 18.75
+  race-high: 84.38
+  winogrande: 75
+
+qwen2-0.5b-hf:
+  gsm8k: 25
+  GPQA_diamond: 0
+  race-high: 40.62
+  winogrande: 62.5
+
+qwen2-1.5b-hf:
+  gsm8k: 59.38
+  GPQA_diamond: 9.38
+  race-high: 81.25
+  winogrande: 62.5
+
+qwen2-7b-hf:
+  gsm8k: 68.75
+  GPQA_diamond: 9.38
+  race-high: 87.5
+  winogrande: 68.75
+
+qwen2-1.5b-turbomind:
+  gsm8k: 62.50
+  GPQA_diamond: 6.25
+  race-high: 81.25
+  winogrande: 75
+
+qwen2-7b-turbomind:
+  gsm8k: 68.75
+  GPQA_diamond: 12.5
+  race-high: 87.5
+  winogrande: 71.88
+
+qwen1.5-0.5b-vllm:
+  gsm8k: 9.38
+  GPQA_diamond: 0
+  race-high: 56.25
+  winogrande: 62.5
+
+yi-1.5-6b-hf:
+  gsm8k: 62.5
+  GPQA_diamond: 3.12
+  race-high: 87.5
+  winogrande: 62.5
+
+yi-1.5-9b-hf:
+  gsm8k: 75
+  GPQA_diamond: 40.62
+  race-high: 87.5
+  winogrande: 59.38
+
+deepseek-v2-lite-hf:
+  gsm8k: 28.12
+  GPQA_diamond: 21.88
+  race-high: 59.38
+  winogrande: 75
+
+internlm2-20b-hf:
+  gsm8k: 56.25
+  GPQA_diamond: 15.62
+  race-high: 68.75
+  winogrande: 75
+
+internlm2-base-20b-hf:
+  gsm8k: 12.5
+  GPQA_diamond: 9.38
+  race-high: 84.38
+  winogrande: 65.62
+
+internlm2-20b-turbomind:
+  gsm8k: 68.75
+  GPQA_diamond: 15.62
+  race-high: 68.75
+  winogrande: 81.25
+
+qwen2.5-14b-hf:
+  gsm8k: 75
+  GPQA_diamond: 37.5
+  race-high: 93.75
+  winogrande: 84.38
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index a141c66a..d16c5b03 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -13,11 +13,31 @@ on:
         description: 'Set branch or tag or commit id. Default is "main"'
         type: string
         default: 'main'
+      build_lmdeploy:
+        required: false
+        description: 'Whether to build lmdeploy'
+        type: boolean
+        default: false
+      repo_org_lmdeploy:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM/lmdeploy'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref_lmdeploy:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
       regression_func:
         required: true
         description: 'regression functions'
         type: string
-        default: "['chat','base','cmd']"
+        default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']"
+      cuda_env:
+        required: true
+        description: "regression conda env, e.g. ['dsw_cu11','dsw_cu12']"
['dsw_cu11','dsw_cu12']" + type: string + default: "['dsw_cu12']" schedule: - cron: '56 16 * * *' @@ -31,7 +51,7 @@ env: HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub - DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets + COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache HF_DATASETS_OFFLINE: 1 HF_EVALUATE_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 @@ -39,6 +59,8 @@ env: LMDEPLOY_USE_MODELSCOPE: false HF_HUB_OFFLINE: 1 TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas + REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} jobs: build-pypi: @@ -64,16 +86,51 @@ jobs: retention-days: 1 name: my-artifact-${{ github.run_id }} - daily_run_test: + build-pypi-lmdeploy: + if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.1 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + prepare_env: if: ${{!cancelled()}} - needs: build-pypi + needs: ['build-pypi', 'build-pypi-lmdeploy'] strategy: fail-fast: false matrix: - cuda_env: [dsw_cu11, dsw_cu12] + cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} runs-on: ${{ matrix.cuda_env }} environment: 'prod' - timeout-minutes: 600 #10hours + timeout-minutes: 240 #4hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -84,89 +141,169 @@ jobs: uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }} + - name: Remove Conda Env + if: always() + run: | + . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + conda info --envs - name: Prepare - create conda env and install torch - cu11 if: ${{matrix.cuda_env == 'dsw_cu11'}} - run: | - . 
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip uninstall torch torchvision torchaudio -y
-          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          conda info --envs
-          pip list
+        uses: nick-fields/retry@v3
+        id: retry1
+        with:
+          max_attempts: 3
+          timeout_minutes: 40
+          command: |
+            . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
+            conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
+            conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+            pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip uninstall torch torchvision torchaudio -y
+            pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+            pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+            conda info --envs
+            pip list
       - name: Prepare - create conda env and install torch - cu12
         if: ${{matrix.cuda_env == 'dsw_cu12'}}
+        uses: nick-fields/retry@v3
+        id: retry2
+        with:
+          max_attempts: 3
+          timeout_minutes: 40
+          command: |
+            . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
+            conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
+            conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+            pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
+            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+            pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+            conda info --envs
+            pip list
+      - name: Prepare - reinstall lmdeploy - cu12
+        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}-py310
+      - name: Prepare - reinstall lmdeploy - cu12
+        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
         run: |
           . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
           conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip uninstall torch torchvision torchaudio -y
-          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
-          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          conda info --envs
-          pip list
+          pip install lmdeploy-*.whl --no-deps
+
+  daily_run_test:
+    if: ${{!cancelled()}}
+    needs: prepare_env
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
+        regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
+    runs-on: ${{ matrix.cuda_env }}
+    environment: 'prod'
+    timeout-minutes: 240 #4hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Prepare - prepare data and hf model
         run: |
-          ln -s ${{env.DATEASET_CACHE_PATH}} data
           rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
           ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
       - name: Run command testcase
-        if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'cmd')
+        if: matrix.regression_func == 'cmd'
         run: |
           . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
           conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
           conda info --envs
           export from_tf=TRUE
           python tools/list_configs.py internlm2_5 mmlu
-          opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2
-          rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
           python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
-          opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2
-          rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
           python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
-          opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2
-          rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
           python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
-          opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2
-          rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
           python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
       - name: Run chat model test
-        if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'chat')
+        if: matrix.regression_func == 'chat_models'
         run: |
           . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
           conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
           conda info --envs
-          sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
-          opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
-          rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
           python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
       - name: Run base model test
-        if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'base')
+        if: matrix.regression_func == 'base_models'
         run: |
           . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
           conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
           conda info --envs
-          opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
-          rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
           python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name: Remove Conda Env
-        if: always()
+      - name: Run chat model test - fullbench
+        if: matrix.regression_func == 'chat_obj_fullbench'
         run: |
-          rm -rf regression_result_daily
           . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
           conda info --envs
+          opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run chat model test - fullbench
+        if: matrix.regression_func == 'chat_sub_fullbench'
+        env:
+          COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
+        run: |
+          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
+          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          conda info --envs
+          opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run base model test - fullbench
+        if: matrix.regression_func == 'base_fullbench'
+        run: |
+          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
+          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          conda info --envs
+          opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run model test - api
+        if: matrix.regression_func == 'api'
+        run: |
+          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
+          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          conda info --envs
+          lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log  2>&1  &
+          echo "restful_pid=$!" >> "$GITHUB_ENV"
+          sleep 120s
+          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run model test - api kill
+        if: always() && matrix.regression_func == 'api'
+        run: |
+          kill -15 "$restful_pid"

   notify_to_feishu:
     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}