From a9d6b6461ff15d11aaaaee02a8c3552c93321c59 Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Tue, 12 Nov 2024 18:40:27 +0800
Subject: [PATCH 01/17] [ci] react daily test (#1668)

* updaste
* update
* update
* update
* update
* update
* update
* update
* update
* update
* updaste
* update
* update
* refactor summarize
* update
* update
* update
* update
* update
* updaste
* update
* update
* update
* update
* updaste
* update
* update
* update
* update
* update
* updaste
* updaste
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* Update daily-run-test.yml
* Update daily-run-test.yml
* update
* update
* update
* update
* update
* Update daily-run-test.yml
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* update
* Update daily-run-test.yml
* Update daily-run-test.yml
* update
* update
* Update daily-run-test.yml
* update
* update
* update

---------

Co-authored-by: zhulin1
---
 .github/scripts/eval_regression_api.py         |  39 ++
 .github/scripts/eval_regression_base.py        |  58 ++-
 .../scripts/eval_regression_base_fullbench.py  | 184 +++++++
 .github/scripts/eval_regression_chat.py        |  78 +--
 ...val_regression_chat_objective_fullbench.py  | 246 ++++++++++
 ...al_regression_chat_subjective_fullbench.py  |  70 +++
 .github/scripts/oc_score_assert.py             | 314 +++++++++---
 .github/scripts/oc_score_baseline.yaml         | 389 ++-------------
 .../scripts/oc_score_baseline_fullbench.yaml   | 153 ++++++
 .../scripts/oc_score_baseline_testrange.yaml   | 459 ++++++++++++++++++
 .github/workflows/daily-run-test.yml           | 241 +++++++--
 11 files changed, 1710 insertions(+), 521 deletions(-)
 create mode 100644 .github/scripts/eval_regression_api.py
 create mode 100644 .github/scripts/eval_regression_base_fullbench.py
 create mode 100644 .github/scripts/eval_regression_chat_objective_fullbench.py
 create mode 100644 .github/scripts/eval_regression_chat_subjective_fullbench.py
 create mode 100644 .github/scripts/oc_score_baseline_fullbench.yaml
 create mode 100644 .github/scripts/oc_score_baseline_testrange.yaml

diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py
new file mode 100644
index 00000000..db4f0ab2
--- /dev/null
+++ b/.github/scripts/eval_regression_api.py
@@ -0,0 +1,39 @@
+from mmengine.config import read_base
+
+from opencompass.models.openai_api import OpenAISDK
+
+with read_base():
+    # choose a list of datasets
+    from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
+        gsm8k_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.race.race_gen import \
+        race_datasets  # noqa: F401, E501
+
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+models = [
+    dict(
+        abbr='lmdeploy-api-test',
+        type=OpenAISDK,
+        key='EMPTY',
+        openai_api_base='http://localhost:23333/v1',
+        path='internlm2',
+        tokenizer_path='internlm/internlm2_5-7b-chat',
+        rpm_verbose=True,
+        meta_template=api_meta_template,
+        query_per_second=128,
+        max_out_len=1024,
+        max_seq_len=4096,
+        temperature=0.01,
+        batch_size=128,
+        retry=20,
+    )
+]
diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base.py
index 12339ecf..330c97e5 100644
--- a/.github/scripts/eval_regression_base.py
+++ b/.github/scripts/eval_regression_base.py
@@ -2,15 +2,21 @@ from mmengine.config import read_base with read_base(): # choose a list of datasets + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets # noqa: F401, E501 from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ gsm8k_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_ppl import \ race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + # read hf models - chat models + from opencompass.configs.models.chatglm.hf_glm4_9b import \ + models as hf_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ models as hf_deepseek_v2_lite_model # noqa: F401, E501 - # read hf models - chat models from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \ @@ -19,34 +25,58 @@ with read_base(): models as hf_gemma2_2b_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_9b import \ models as hf_gemma2_9b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_2b import \ + models as hf_gemma_2b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_7b import \ + models as hf_gemma_7b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.vllm_gemma_2b import \ + models as vllm_gemma_2b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.vllm_gemma_7b import \ + models as vllm_gemma_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \ models as hf_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ models as hf_internlm2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \ + models as hf_internlm2_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \ models as hf_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \ + models as hf_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \ models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \ models as lmdeploy_internlm2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \ + models as lmdeploy_internlm2_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama2_7b import \ models as hf_llama2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \ + models as hf_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b import \ models as hf_llama3_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b 
import \ models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ models as lmdeploy_llama3_8b_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \ + models as hf_mistral_7b_v0_2_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \ models as hf_mistral_7b_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \ models as vllm_mistral_7b_v0_2_model # noqa: F401, E501 - from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \ - models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \ + models as hf_qwen_2_5_7b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \ + models as hf_qwen_2_5_14b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \ + models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \ + models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \ models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_0_5b import \ @@ -65,11 +95,27 @@ with read_base(): models as hf_yi_1_5_6b_model # noqa: F401, E501 from opencompass.configs.models.yi.hf_yi_1_5_9b import \ models as hf_yi_1_5_9b_model # noqa: F401, E501 - from opencompass.configs.summarizers.medium import \ - summarizer # noqa: F401, E501 +race_datasets = [race_datasets[1]] models = sum([v for k, v in locals().items() if k.endswith('_model')], []) datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) for d in datasets: - d['reader_cfg']['test_range'] = '[0:100]' + d['reader_cfg']['test_range'] = '[0:32]' + +for m in models: + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +summarizer = dict( + dataset_abbrs=[ + ['gsm8k', 'accuracy'], + ['GPQA_diamond', 'accuracy'], + ['race-high', 'accuracy'], + ['winogrande', 'accuracy'], + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py new file mode 100644 index 00000000..d5ad48c4 --- /dev/null +++ b/.github/scripts/eval_regression_base_fullbench.py @@ -0,0 +1,184 @@ +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \ + ARC_c_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.drop.drop_gen_a2697c import \ + drop_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \ + gpqa_datasets # noqa: F401, E501 + # Corebench v1.7 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ + gsm8k_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \ + hellaswag_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \ + humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \ + humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \ + mmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_few_shot_ppl import \ + race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \ + BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \ + wikibench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \ + models as hf_internlm2_5_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ + models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501 + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + +race_datasets = [race_datasets[1]] # Only take RACE-High +humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2' +bbh_datasets = [ + x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr'] + or 'multistep_arithmetic_two' in x['abbr'] +] +cmmlu_datasets = [ + x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [ + 'ancient_chinese', 'chinese_civil_service_exam', + 'chinese_driving_rule', 'chinese_food_culture', + 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', + 'chinese_teacher_qualification', 'construction_project_management', + 'elementary_chinese', 'elementary_commonsense', 'ethnology', + 'high_school_politics', 'modern_chinese', + 'traditional_chinese_medicine' + ] +] +mmlu_datasets = [ + x for x in mmlu_datasets if 
x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', + 'global_facts', 'human_aging', 'management', 'marketing', + 'medical_genetics', 'miscellaneous', 'nutrition', + 'professional_accounting', 'professional_medicine', 'virology' + ] +] +mmlu_pro_datasets = [mmlu_pro_datasets[0]] +mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']] +GaokaoBench_datasets = [ + x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr'] + or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr'] +] +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +summarizer = dict( + dataset_abbrs=[ + ['race-high', 'accuracy'], + ['ARC-c', 'accuracy'], + ['BoolQ', 'accuracy'], + ['mmlu_pro', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['drop', 'accuracy'], + ['bbh', 'naive_average'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['openai_humaneval_v2', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + ['winogrande', 'accuracy'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', + '###### Overall: Average between MathBench-A and MathBench-T ######', + 'Overall', + '', + 'bbh-logical_deduction_seven_objects', + 'bbh-multistep_arithmetic_two', + '', + 'mmlu', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-humanities', + ['mmlu-other', 'accuracy'], + 'cmmlu', + 'cmmlu-stem', + 'cmmlu-social-science', + 'cmmlu-humanities', + 'cmmlu-other', + ['cmmlu-china-specific', 'accuracy'], + 'mmlu_pro', + 'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' + +for m in models: + m['abbr'] = m['abbr'] + '_fullbench' + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py index fa28562f..68c225c5 100644 --- a/.github/scripts/eval_regression_chat.py +++ b/.github/scripts/eval_regression_chat.py @@ -1,7 +1,5 @@ from mmengine.config import read_base -from opencompass.models import OpenAISDK - with read_base(): # choose a list of datasets from opencompass.configs.datasets.gsm8k.gsm8k_gen import \ @@ -29,6 +27,12 @@ with read_base(): models as hf_gemma2_2b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_9b_it 
import \ models as hf_gemma2_9b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_2b_it import \ + models as hf_gemma_2b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma_7b_it import \ + models as hf_gemma_7b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \ + models as lmdeploy_gemma_9b_it_model # noqa: F401, E501 from opencompass.configs.models.gemma.vllm_gemma_7b_it import \ models as vllm_gemma_7b_it_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ @@ -51,18 +55,35 @@ with read_base(): models as vllm_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ models as hf_llama3_1_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \ + models as hf_llama3_2_3b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \ + models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \ + models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \ models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \ + models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ + models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \ + models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ + models as \ + lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \ + models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 - from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \ - models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 + from opencompass.configs.models.openbmb.hf_minicpm3_4b import \ + models as hf_minicpm3_4b_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \ models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \ @@ -73,6 +94,10 @@ with read_base(): models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501 from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \ models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \ + models as 
hf_qwen2_5_14b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \ + models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \ models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \ @@ -89,10 +114,8 @@ with read_base(): models as hf_yi_1_5_6b_chat_model # noqa: F401, E501 from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \ models as hf_yi_1_5_9b_chat_model # noqa: F401, E501 - from opencompass.configs.summarizers.medium import \ - summarizer # noqa: F401, E501 -models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +race_datasets = [race_datasets[1]] datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) api_meta_template = dict( @@ -103,25 +126,24 @@ api_meta_template = dict( reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) -model_name = '' - -models.append( - dict( - abbr='lmdeploy-api-test', - type=OpenAISDK, - key='EMPTY', - openai_api_base='http://judgemodel:10001/v1', - path='compass_judger_internlm2_102b_0508', - tokenizer_path='internlm/internlm2_5-20b-chat', - rpm_verbose=True, - meta_template=api_meta_template, - query_per_second=50, - max_out_len=1024, - max_seq_len=4096, - temperature=0.01, - batch_size=128, - retry=3, - )) - for d in datasets: - d['reader_cfg']['test_range'] = '[0:100]' + d['reader_cfg']['test_range'] = '[0:32]' + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +for m in models: + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +summarizer = dict( + dataset_abbrs=[ + 'gsm8k', + 'race-middle', + 'race-high', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_objective_fullbench.py new file mode 100644 index 00000000..ff8dfba4 --- /dev/null +++ b/.github/scripts/eval_regression_chat_objective_fullbench.py @@ -0,0 +1,246 @@ +from mmengine.config import read_base + +with read_base(): + # read hf models - chat models + # Dataset + from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \ + ARC_c_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \ + drop_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \ + ds1000_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets # noqa: F401, E501 + # new datasets in Fullbench v1.1 + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \ + humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \ + humanevalx_datasets # noqa: F401, E501 + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ + ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \ + mmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_cot_gen_d95929 import \ + race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \ + SciCode_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \ + BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \ + teval_datasets as teval_en_datasets # noqa: F401, E501 + from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \ + teval_datasets as teval_zh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \ + wikibench_datasets # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ + models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ + models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.ds1000 import \ + ds1000_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.humanevalx import \ + humanevalx_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.scicode import \ + scicode_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.teval import \ + teval_summary_groups # noqa: F401, E501 + +# For HumanEval-X Evaluation +# Apply the evaluator ip_address and port +race_datasets = [race_datasets[1]] +for item in humanevalx_datasets: 
+ item['eval_cfg']['evaluator'][ + 'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx' + item['eval_cfg']['evaluator']['port'] = '' + +# For DS-1000 Evaluation +# Apply the evaluator ip_address and port +for item in ds1000_datasets: + item['eval_cfg']['evaluator'][ + 'ip_address'] = 'codeeval.opencompass.org.cn/ds1000' + item['eval_cfg']['evaluator']['port'] = '' + +bbh_datasets = [ + x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr'] + or 'multistep_arithmetic_two' in x['abbr'] +] +cmmlu_datasets = [ + x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [ + 'ancient_chinese', 'chinese_civil_service_exam', + 'chinese_driving_rule', 'chinese_food_culture', + 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', + 'chinese_teacher_qualification', 'construction_project_management', + 'elementary_chinese', 'elementary_commonsense', 'ethnology', + 'high_school_politics', 'modern_chinese', + 'traditional_chinese_medicine' + ] +] +mmlu_datasets = [ + x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', + 'global_facts', 'human_aging', 'management', 'marketing', + 'medical_genetics', 'miscellaneous', 'nutrition', + 'professional_accounting', 'professional_medicine', 'virology' + ] +] + +mmlu_pro_datasets = [mmlu_pro_datasets[0]] +mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']] +GaokaoBench_datasets = [ + x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr'] + or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr'] +] + +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets') + and 'scicode' not in k.lower() and 'teval' not in k), + [], +) +datasets += teval_en_datasets +datasets += teval_zh_datasets +# datasets += SciCode_datasets + +summarizer = dict( + dataset_abbrs=[ + ['race-high', 'accuracy'], + ['ARC-c', 'accuracy'], + ['BoolQ', 'accuracy'], + ['mmlu_pro', 'naive_average'], + ['drop', 'accuracy'], + ['bbh', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['math', 'accuracy'], + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['openai_humaneval', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['teval', 'naive_average'], + ['SciCode', 'accuracy'], + ['SciCode', 'sub_accuracy'], + ['humanevalx', 'naive_average'], + ['ds1000', 'naive_average'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', + '###### Overall: Average between MathBench-A and MathBench-T ######', + 'Overall', + '', + 'bbh-logical_deduction_seven_objects', + 'bbh-multistep_arithmetic_two', + '' + 'mmlu', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-humanities', + 'mmlu-other', + '', + 'cmmlu', + 'cmmlu-stem', + 'cmmlu-social-science', + 'cmmlu-humanities', + 'cmmlu-other', + 'cmmlu-china-specific', + '', + 'mmlu_pro', + 'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 
'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + '', + 'GaokaoBench_2010-2022_Math_II_MCQs', + 'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank', + '', + 'humanevalx-python', + 'humanevalx-cpp', + 'humanevalx-go', + 'humanevalx-java', + 'humanevalx-js', + '', + 'ds1000_Pandas', + 'ds1000_Numpy', + 'ds1000_Tensorflow', + 'ds1000_Scipy', + 'ds1000_Sklearn', + 'ds1000_Pytorch', + 'ds1000_Matplotlib', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +for m in models: + m['abbr'] = m['abbr'] + '_fullbench' + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) diff --git a/.github/scripts/eval_regression_chat_subjective_fullbench.py b/.github/scripts/eval_regression_chat_subjective_fullbench.py new file mode 100644 index 00000000..8a6ef8fd --- /dev/null +++ b/.github/scripts/eval_regression_chat_subjective_fullbench.py @@ -0,0 +1,70 @@ +from copy import deepcopy + +from mmengine.config import read_base + +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.summarizers import SubjectiveSummarizer +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + +with read_base(): + # read hf models - chat models + # Dataset + from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \ + alignbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ + alpacav2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ + compassarena_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ + fofo_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \ + followbench_llmeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ + mtbench101_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ + wildbench_datasets # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ + models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ + models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + +summarizer = dict(type=SubjectiveSummarizer, function='subjective') + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') + and 'mtbench101' not in k and 'wildbench' not in k), []) +datasets += mtbench101_datasets # noqa: F401, E501 +datasets += wildbench_datasets # noqa: F401, E501 + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = sum([v for k, v in 
locals().items() if k.endswith('_model')], []) +for m in models: + m['abbr'] = m['abbr'] + '_fullbench' + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +judge_models = deepcopy([models[1]]) +judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge' + +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 2741a66a..d8e33adb 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -7,36 +7,56 @@ import yaml output_path = 'regression_result_daily' chat_model_list = [ - 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', - 'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', - 'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind', - 'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', + 'baichuan2-7b-chat-hf', 'glm-4-9b-chat-hf', 'glm-4-9b-chat-turbomind', + 'glm-4-9b-chat-vllm', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', + 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf', + 'gemma-2b-it-hf', 'gemma-7b-it-hf', 'gemma-2-9b-it-turbomind', + 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', + 'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-lmdeploy', 'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm', - 'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf', - 'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', - 'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm', - 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', + 'llama-3_1-8b-instruct-hf', 'llama-3_2-3b-instruct-hf', + 'llama-3-8b-instruct-hf', 'llama-3_1-8b-instruct-turbomind', + 'llama-3_2-3b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', + 'mistral-7b-instruct-v0.2-hf', 'mistral-7b-instruct-v0.3-hf', + 'mistral-nemo-instruct-2407-hf', 'mistral-nemo-instruct-2407-turbomind', + 'mistral-7b-instruct-v0.1-vllm', 'mistral-7b-instruct-v0.2-vllm', + 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf', 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', - 'lmdeploy-api-test' + 'deepseek-v2-lite-chat-hf', 'internlm2_5-20b-chat-hf', + 'internlm2_5-20b-chat-turbomind', 'mistral-small-instruct-2409-hf', + 'mistral-small-instruct-2409-turbomind', 'qwen2.5-14b-instruct-hf', + 'qwen2.5-14b-instruct-turbomind' ] base_model_list = [ - 'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf', - 'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', - 'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf', - 'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind', - 'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind', - 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf', - 'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf', - 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', + 'glm-4-9b-hf', 
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', + 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf', + 'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm', + 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', + 'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind', + 'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', + 'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind', + 'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf', + 'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind', + 'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', 'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind', - 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' + 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf', + 'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf', + 'internlm2-20b-turbomind', 'qwen2.5-14b-hf' ] -dataset_list = ['gsm8k', 'race-middle', 'race-high'] + + +@pytest.fixture() +def baseline_scores_testrange(request): + config_path = os.path.join( + request.config.rootdir, + '.github/scripts/oc_score_baseline_testrange.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config @pytest.fixture() @@ -48,6 +68,16 @@ def baseline_scores(request): return config +@pytest.fixture() +def baseline_scores_fullbench(request): + config_path = os.path.join( + request.config.rootdir, + '.github/scripts/oc_score_baseline_fullbench.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config + + @pytest.fixture() def result_scores(): file = find_csv_files(output_path) @@ -57,100 +87,228 @@ def result_scores(): @pytest.mark.usefixtures('result_scores') -@pytest.mark.usefixtures('baseline_scores') +@pytest.mark.usefixtures('baseline_scores_testrange') @pytest.mark.chat class TestChat: """Test cases for chat model.""" - @pytest.mark.parametrize('model, dataset', [(p1, p2) - for p1 in chat_model_list - for p2 in dataset_list]) - def test_model_dataset_score(self, baseline_scores, result_scores, model, - dataset): - base_score = baseline_scores.get(model).get(dataset) + @pytest.mark.parametrize('model, dataset', + [(p1, p2) for p1 in chat_model_list + for p2 in ['gsm8k', 'race-high']]) + def test_model_dataset_score(self, baseline_scores_testrange, + result_scores, model, dataset): + base_score = baseline_scores_testrange.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(result_score, base_score) + assert_score(model, result_score, base_score) @pytest.mark.usefixtures('result_scores') -@pytest.mark.usefixtures('baseline_scores') +@pytest.mark.usefixtures('baseline_scores_testrange') @pytest.mark.base class TestBase: """Test cases for base model.""" - @pytest.mark.parametrize('model, dataset', [(p1, p2) - for p1 in base_model_list - for p2 in dataset_list]) - def test_model_dataset_score(self, baseline_scores, result_scores, model, - dataset): - if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high': + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in base_model_list + for p2 in ['gsm8k', 'GPQA_diamond', 'race-high', 'winogrande']]) + def test_model_dataset_score(self, baseline_scores_testrange, + result_scores, model, dataset): + if model in ['gemma-2b-vllm', 'gemma-7b-vllm'] and dataset != 'gsm8k': return - base_score = baseline_scores.get(model).get(dataset) + base_score = 
baseline_scores_testrange.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(result_score, base_score) + assert_score(model, result_score, base_score) @pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_fullbench') +@pytest.mark.chat_obj_fullbench +class TestChatObjFullbench: + """Test cases for chat model.""" + + @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ + 'internlm2_5-7b-chat-hf_fullbench', + 'internlm2_5-7b-chat-turbomind_fullbench' + ] for p2 in [ + 'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math', + 'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000', + 'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag', + 'TheoremQA', 'college', 'college_knowledge', + 'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two', + 'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas', + 'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn', + 'ds1000_Pytorch', 'ds1000_Matplotlib' + ]]) + def test_model_dataset_score(self, baseline_scores_fullbench, + result_scores, model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_fullbench') +@pytest.mark.chat_sub_fullbench +class TestChatSubFullbench: + """Test cases for chat model.""" + + @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ + 'internlm2_5-7b-chat-hf_fullbench', + 'internlm2_5-7b-chat-turbomind_fullbench' + ] for p2 in [ + 'Alignbench总分', 'Alignbench专业能力', 'AlpacaEvaltotal', + 'AlpacaEvalhelpful_base', 'CompassArenacompassarena_language', + 'CompassArenacompassarena_knowledge', + 'CompassArenacompassarena_reason_v2', + 'CompassArenacompassarena_math_v2', + 'CompassArenacompassarena_creationv2_zh', 'Fofofofo_test_prompts', + 'followbenchHSR_AVG', 'followbenchSSR_AVG', 'followbenchHSR_L1', + 'followbenchHSR_L2', 'followbenchHSR_L3', 'followbenchHSR_L4', + 'followbenchHSR_L5', 'followbenchSSR_L1', 'followbenchSSR_L2', + 'followbenchSSR_L3', 'followbenchSSR_L4', 'followbenchSSR_L5', + 'MTBench101average', 'Wildbenchscore' + ]]) + def test_model_dataset_score(self, baseline_scores_fullbench, + result_scores, model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_fullbench') +@pytest.mark.base_fullbench +class TestBaseFullbench: + """Test cases for chat model.""" + + @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ + 'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench' + ] for p2 in [ + 'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math', + 'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k', + 'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag', + 'TheoremQA', 'college', 'college_knowledge', + 'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two', + 'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math' + ]]) + def test_model_dataset_score(self, baseline_scores_fullbench, + result_scores, model, dataset): + base_score = baseline_scores_fullbench.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + 
assert_score(model, result_score, base_score) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores') +@pytest.mark.api +class TestApibench: + """Test cases for chat model.""" + + @pytest.mark.parametrize('model, dataset', + [('lmdeploy-api-test', 'race-middle'), + ('lmdeploy-api-test', 'race-high'), + ('lmdeploy-api-test', 'gsm8k')]) + def test_api(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores') class TestCmdCase: @pytest.mark.case1 @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b-hf', 'race-middle'), - ('internlm2_5-7b-hf', 'race-high')]) - def test_cmd_case1(self, result_scores, model, dataset): - if len(result_scores.keys()) != 1: - assert False, 'result is none' + ('internlm2_5-7b-hf', 'race-high'), + ('internlm2_5-7b-hf', 'demo_gsm8k'), + ('internlm2-1.8b-hf', 'race-middle'), + ('internlm2-1.8b-hf', 'race-high'), + ('internlm2-1.8b-hf', 'demo_gsm8k')]) + def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(result_score, 91) + assert_score(model, result_score, base_score) @pytest.mark.case2 @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b-chat-lmdeploy', 'race-middle'), - ('internlm2_5-7b-chat-lmdeploy', 'race-high')]) - def test_cmd_case2(self, result_scores, model, dataset): - if len(result_scores.keys()) != 1: - assert False, 'result is none' + ('internlm2_5-7b-chat-lmdeploy', 'race-high'), + ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k'), + ('internlm2-chat-1.8b-lmdeploy', 'race-middle'), + ('internlm2-chat-1.8b-lmdeploy', 'race-high'), + ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k')]) + def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(result_score, 91) + assert_score(model + '_batch', result_score, base_score) @pytest.mark.case3 @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b_hf', 'race-middle'), - ('internlm2_5-7b_hf', 'race-high')]) - def test_cmd_case3(self, result_scores, model, dataset): - if len(result_scores.keys()) != 1: - assert False, 'result is none' + ('internlm2_5-7b_hf', 'race-high'), + ('internlm2_5-7b_hf', 'demo_gsm8k')]) + def test_cmd_case3(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(result_score, 91) + assert_score(model, result_score, base_score) @pytest.mark.case4 @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle'), - ('internlm2_5-7b-chat_hf', 'race-high')]) - def test_cmd_case4(self, result_scores, model, dataset): - if len(result_scores.keys()) != 1: - assert False, 'result is none' + ('internlm2_5-7b-chat_hf', 'race-high'), + ('internlm2_5-7b-chat_hf', 'demo_gsm8k')]) + def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(result_score, 91) + assert_score(model, result_score, base_score) -def assert_score(score, baseline): 
+THRESHOLD = 3 + + +def assert_score(model_type, score, baseline): if score is None or score == '-': assert False, 'value is none' - if float(score) <= (baseline + 5) and float(score) >= (baseline - 5): - print(score + ' between ' + str(baseline - 5) + ' and ' + - str(baseline + 5)) - assert True + + if 'batch' not in model_type: + if float(score) <= (baseline + 0.01) and float(score) >= (baseline - + 0.01): + print(' '.join([score, 'is equal', str(baseline)])) + assert True + else: + print(' '.join([score, 'is not equal', str(baseline)])) + assert False, ' '.join([score, 'is not equal', str(baseline)]) else: - assert False, score + ' not between ' + str( - baseline - 5) + ' and ' + str(baseline + 5) + if float(score) <= (baseline + THRESHOLD) and float(score) >= ( + baseline - THRESHOLD): + print(' '.join([ + score, 'is between', + str(baseline - THRESHOLD), 'and', + str(baseline + THRESHOLD) + ])) + assert True + else: + print(' '.join([ + score, 'is not etween', + str(baseline - THRESHOLD), 'and', + str(baseline + THRESHOLD) + ])) + assert False, ' '.join([ + score, 'is not etween', + str(baseline - THRESHOLD), 'and', + str(baseline + THRESHOLD) + ]) def find_csv_files(directory): csv_files = [] for root, dirs, files in os.walk(directory): for file in files: - if file.endswith('.csv'): + if file.endswith('.csv') and (file.startswith('summary') or + file.startswith('Subjective_all')): csv_files.append(os.path.join(root, file)) csv_files_with_time = {f: os.path.getctime(f) for f in csv_files} @@ -163,14 +321,24 @@ def read_csv_file(file_path): with open(file_path, 'r') as csvfile: reader = csv.DictReader(csvfile) filtered_data = [] - - for row in reader: - filtered_row = { - k: v - for k, v in row.items() - if k not in ['version', 'metric', 'mode'] - } - filtered_data.append(filtered_row) + if 'Subjective_all' not in file_path: + for row in reader: + if row['metric'] is not None and 'bpb' not in row['metric']: + filtered_row = { + k: v + for k, v in row.items() + if k not in ['version', 'metric', 'mode'] + } + filtered_data.append(filtered_row) + else: + for row in reader: + if row['Detailed Scores'] is not None: + filtered_row = row + filtered_row['dataset'] = filtered_row[ + 'Dataset'] + filtered_row['Detailed Scores'] + del filtered_row['Dataset'] + del filtered_row['Detailed Scores'] + filtered_data.append(filtered_row) result = {} for data in filtered_data: diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 4c3b38f6..40cb1087 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -1,369 +1,34 @@ -baichuan2-7b-chat-hf: - gsm8k: 30 - race-middle: 74 - race-high: 79 +internlm2_5-7b-hf: + demo_gsm8k: 42.19 + race-middle: 91.78 + race-high: 90.02 -glm-4-9b-chat-hf: - gsm8k: 75 - race-middle: 88 - race-high: 88 +internlm2_5-7b_hf: + demo_gsm8k: 42.19 + race-middle: 91.78 + race-high: 90.02 -glm-4-9b-chat-turbomind: - gsm8k: 69 - race-middle: 82 - race-high: 77 +internlm2-1.8b-hf: + demo_gsm8k: 15.62 + race-middle: 71.66 + race-high: 66.38 -glm-4-9b-chat-vllm: - gsm8k: 73 - race-middle: 87 - race-high: 87 +internlm2_5-7b-chat-lmdeploy: + demo_gsm8k: 84.38 + race-middle: 92.76 + race-high: 90.54 -deepseek-7b-chat-hf: - gsm8k: 60 - race-middle: 74 - race-high: 80 +internlm2-chat-1.8b-lmdeploy: + demo_gsm8k: 31 + race-middle: 81.34 + race-high: 73.96 -deepseek-moe-16b-chat-hf: - gsm8k: 62 - race-middle: 62 - race-high: 70 - -deepseek-v2-lite-chat-hf: - gsm8k: 59 - race-middle: 82 - race-high: 79 - 
-deepseek-7b-chat-vllm: - gsm8k: 63 - race-middle: 74 - race-high: 79 - -gemma-2b-it-hf: - gsm8k: 14 - race-middle: 62 - race-high: 52 - -gemma-7b-it-hf: - gsm8k: 39 - race-middle: 74 - race-high: 71 - -gemma-7b-it-vllm: - gsm8k: 38 - race-middle: 75 - race-high: 70 - -gemma2-2b-it-hf: - gsm8k: 62 - race-middle: 75 - race-high: 67 - -gemma2-9b-it-hf: - gsm8k: 80 - race-middle: 89 - race-high: 85 - -internlm2_5-7b-chat-hf: - gsm8k: 86 - race-middle: 92 - race-high: 93 - -internlm2_5-20b-chat-hf: - gsm8k: 91 - race-middle: 95 - race-high: 91 - -internlm2_5-7b-chat-turbomind: - gsm8k: 87 - race-middle: 92 - race-high: 93 - -internlm2_5-20b-chat-turbomind: - gsm8k: 91 - race-middle: 95 - race-high: 91 - -internlm2-chat-1.8b-turbomind: - gsm8k: 40 - race-middle: 82 - race-high: 83 - -internlm2-chat-1.8b-sft-turbomind: - gsm8k: 34 - race-middle: 81 - race-high: 83 - -internlm2-chat-7b-lmdeploy: - gsm8k: 69 - race-middle: 90 - race-high: 88 - -internlm2-chat-7b-sft-turbomind: - gsm8k: 71 - race-middle: 91 - race-high: 92 - -internlm2-chat-7b-vllm: - gsm8k: 63 - race-middle: 90 - race-high: 91 - -llama-3_1-8b-instruct-hf: - gsm8k: 82 - race-middle: 82 - race-high: 88 - -llama-3-8b-instruct-hf: - gsm8k: 77 - race-middle: 85 - race-high: 87 - -llama-3_1-8b-instruct-turbomind: - gsm8k: 79 - race-middle: 82 - race-high: 88 - -llama-3-8b-instruct-turbomind: - gsm8k: 77 - race-middle: 85 - race-high: 89 - -mistral-7b-instruct-v0.2-hf: - gsm8k: 48 - race-middle: 82 - race-high: 78 - -mistral-7b-instruct-v0.3-hf: - gsm8k: 53 - race-middle: 80 - race-high: 78 - -mistral-7b-instruct-v0.2-vllm: - gsm8k: 49 - race-middle: 81 - race-high: 77 - -minicpm-2b-dpo-fp32-hf: - gsm8k: 58 - race-middle: 66 - race-high: 74 - -minicpm-2b-sft-bf16-hf: - gsm8k: 58 - race-middle: 75 - race-high: 81 - -minicpm-2b-sft-fp32-hf: - gsm8k: 58 - race-middle: 75 - race-high: 81 - -phi-3-mini-4k-instruct-hf: - gsm8k: 67 - race-middle: 81 - race-high: 84 - -phi-3-small-8k-instruct-hf: - gsm8k: 88 - race-middle: 89 - race-high: 88 - -qwen1.5-0.5b-chat-hf: - gsm8k: 5 - race-middle: 55 - race-high: 50 - -qwen2-1.5b-instruct-hf: - gsm8k: 63 - race-middle: 77 - race-high: 86 - -qwen2-1.5b-instruct-turbomind: - gsm8k: 60 - race-middle: 77 - race-high: 86 - -qwen2-7b-instruct-turbomind: - gsm8k: 88 - race-middle: 87 - race-high: 89 - -qwen2-7b-instruct-hf: - gsm8k: 85 - race-middle: 87 - race-high: 91 - -qwen1.5-0.5b-chat-vllm: - gsm8k: 5 - race-middle: 57 - race-high: 51 - -yi-1.5-6b-chat-hf: - gsm8k: 72 - race-middle: 88 - race-high: 86 - -yi-1.5-9b-chat-hf: - gsm8k: 81 - race-middle: 89 - race-high: 91 +internlm2_5-7b-chat_hf: + demo_gsm8k: 87.50 + race-middle: 92.76 + race-high: 90.48 lmdeploy-api-test: - gsm8k: 90 - race-middle: 95 - race-high: 96 - -deepseek-moe-16b-base-hf: - gsm8k: 25 - race-middle: 35 - race-high: 23 - -deepseek-v2-lite-hf: - gsm8k: 37 - race-middle: 56 - race-high: 62 - -deepseek-7b-base-turbomind: - gsm8k: 21 - race-middle: 42 - race-high: 42 - -deepseek-moe-16b-base-vllm: - gsm8k: 22 - race-middle: 35 - race-high: 20 - -gemma-2b-hf: - gsm8k: 19 - race-middle: 33 - race-high: 26 - -gemma-7b-hf: - gsm8k: 65 - race-middle: 59 - race-high: 66 - -gemma2-2b-hf: - gsm8k: 33 - race-middle: 56 - race-high: 58 - -gemma2-9b-hf: - gsm8k: 70 - race-middle: 82 - race-high: 84 - -internlm2_5-7b-hf: - gsm8k: 47 - race-middle: 92 - race-high: 91 - -internlm2-7b-hf: - gsm8k: 65 - race-middle: 77 - race-high: 72 - -internlm2-base-7b-hf: - gsm8k: 5 - race-middle: 71 - race-high: 74 - -internlm2_5-7b-turbomind: - gsm8k: 73 - 
race-middle: 90 - race-high: 91 - -internlm2-1.8b-turbomind: - gsm8k: 25 - race-middle: 75 - race-high: 72 - -internlm2-7b-turbomind: - gsm8k: 67 - race-middle: 78 - race-high: 76 - -internlm2-base-7b-turbomind: - gsm8k: 39 - race-middle: 75 - race-high: 81 - -llama-2-7b-hf: - gsm8k: 17 - race-middle: 32 - race-high: 38 - -llama-3-8b-hf: - gsm8k: 48 - race-middle: 64 - race-high: 70 - -llama-3.1-8b-turbomind: - gsm8k: 57 - race-middle: 67 - race-high: 75 - -llama-3-8b-turbomind: - gsm8k: 52 - race-middle: 63 - race-high: 70 - -mistral-7b-v0.2-hf: - gsm8k: 43 - race-middle: 42 - race-high: 60 - -mistral-7b-v0.3-hf: - gsm8k: 43 - race-middle: 42 - race-high: 60 - -mistral-7b-v0.2-vllm: - gsm8k: 45 - race-middle: 42 - race-high: 58 - -qwen1.5-moe-a2.7b-hf: - gsm8k: 64 - race-middle: 78 - race-high: 90 - -qwen2-1.5b-hf: - gsm8k: 58 - race-middle: 65 - race-high: 78 - -qwen2-0.5b-hf: - gsm8k: 35 - race-middle: 52 - race-high: 48 - -qwen2-7b-hf: - gsm8k: 82 - race-middle: 88 - race-high: 89 - -qwen2-1.5b-turbomind: - gsm8k: 57 - race-middle: 64 - race-high: 78 - -qwen2-7b-turbomind: - gsm8k: 83 - race-middle: 88 - race-high: 88 - -qwen1.5-0.5b-vllm: - gsm8k: 12 - race-middle: 54 - race-high: 59 - -yi-1.5-6b-hf: - gsm8k: 59 - race-middle: 81 - race-high: 89 - -yi-1.5-9b-hf: - gsm8k: 77 - race-middle: 90 - race-high: 90 + gsm8k: 83.78 + race-middle: 92.41 + race-high: 90.37 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml new file mode 100644 index 00000000..4eea62fe --- /dev/null +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -0,0 +1,153 @@ +internlm2_5-7b-chat-hf_fullbench: + race-high: 93.75 + ARC-c: 87.5 + BoolQ: 81.25 + drop: 81.25 + GPQA_diamond: 25 + math: 75 + wikibench-wiki-single_choice_cncircular: 50 + sanitized_mbpp: 68.75 + ds1000: 16.96 + gsm8k: 56.25 + triviaqa_wiki_1shot: 50 + nq_open_1shot: 25 + hellaswag: 87.5 + TheoremQA: 18.75 + college: 12.5 + college_knowledge: 87.5 + bbh-logical_deduction_seven_objects: 50 + bbh-multistep_arithmetic_two: 68.75 + mmlu-other: 72.6 + cmmlu-china-specific: 76.25 + mmlu_pro_math: 25 + ds1000_Pandas: 12.5 + ds1000_Numpy: 0 + ds1000_Tensorflow: 12.5 + ds1000_Scipy: 18.75 + ds1000_Sklearn: 18.75 + ds1000_Pytorch: 12.5 + ds1000_Matplotlib: 43.75 + Alignbench总分: 0.65 + Alignbench专业能力: 7.83 + AlpacaEvaltotal: 0 + AlpacaEvalhelpful_base: 0 + CompassArenacompassarena_language: 60 + CompassArenacompassarena_knowledge: 56 + CompassArenacompassarena_reason_v2: 50 + CompassArenacompassarena_math_v2: 53.5 + CompassArenacompassarena_creationv2_zh: 48.75 + Fofofofo_test_prompts: 1 + followbenchHSR_AVG: 1 + followbenchSSR_AVG: 1 + followbenchHSR_L1: 1 + followbenchHSR_L2: 1 + followbenchHSR_L3: 1 + followbenchHSR_L4: 1 + followbenchHSR_L5: 1 + followbenchSSR_L1: 1 + followbenchSSR_L2: 1 + followbenchSSR_L3: 1 + followbenchSSR_L4: 1 + followbenchSSR_L5: 1 + MTBench101average: 8.1 + Wildbenchscore: -3.3333333333333335 + +internlm2_5-7b-chat-turbomind_fullbench: + race-high: 93.75 + ARC-c: 87.5 + BoolQ: 68.75 + drop: 75 + GPQA_diamond: 25 + math: 75 + wikibench-wiki-single_choice_cncircular: 25 + sanitized_mbpp: 68.75 + ds1000: 13.39 + gsm8k: 68.75 + triviaqa_wiki_1shot: 50 + nq_open_1shot: 25 + hellaswag: 81.25 + TheoremQA: 6.25 + college: 0 + college_knowledge: 87.5 + bbh-logical_deduction_seven_objects: 56.25 + bbh-multistep_arithmetic_two: 68.75 + mmlu-other: 74.04 + cmmlu-china-specific: 76.25 + mmlu_pro_math: 25 + ds1000_Pandas: 0 + ds1000_Numpy: 0 + ds1000_Tensorflow: 12.5 + 
ds1000_Scipy: 18.75 + ds1000_Sklearn: 18.75 + ds1000_Pytorch: 6.25 + ds1000_Matplotlib: 37.5 + Alignbench总分: 0.64 + Alignbench专业能力: 7.6 + AlpacaEvaltotal: 10 + AlpacaEvalhelpful_base: 10 + CompassArenacompassarena_language: 59 + CompassArenacompassarena_knowledge: 57 + CompassArenacompassarena_reason_v2: 49.5 + CompassArenacompassarena_math_v2: 51 + CompassArenacompassarena_creationv2_zh: 43.75 + Fofofofo_test_prompts: 1 + followbenchHSR_AVG: 1 + followbenchSSR_AVG: 1 + followbenchHSR_L1: 1 + followbenchHSR_L2: 1 + followbenchHSR_L3: 1 + followbenchHSR_L4: 1 + followbenchHSR_L5: 1 + followbenchSSR_L1: 1 + followbenchSSR_L2: 1 + followbenchSSR_L3: 1 + followbenchSSR_L4: 1 + followbenchSSR_L5: 1 + MTBench101average: 8.1 + Wildbenchscore: -8.333333333333334 + +internlm2_5-7b-hf_fullbench: + race-high: 100 + ARC-c: 68.75 + BoolQ: 87.5 + GPQA_diamond: 62.5 + drop: 62.5 + math: 12.5 + wikibench-wiki-single_choice_cncircular: 25 + sanitized_mbpp: 56.25 + gsm8k: 37.5 + triviaqa_wiki_1shot: 43.75 + nq_open_1shot: 43.75 + winogrande: 75 + hellaswag: 93.75 + TheoremQA: 25 + college: 12.5 + college_knowledge: 87.5 + bbh-logical_deduction_seven_objects: 43.75 + bbh-multistep_arithmetic_two: 56.25 + mmlu-other: 76.92 + cmmlu-china-specific: 84.17 + mmlu_pro_math: 18.75 + +internlm2_5-7b-turbomind_fullbench: + race-high: 100 + ARC-c: 68.75 + BoolQ: 87.5 + GPQA_diamond: 62.5 + drop: 62.5 + math: 18.75 + wikibench-wiki-single_choice_cncircular: 25 + sanitized_mbpp: 56.25 + gsm8k: 68.75 + triviaqa_wiki_1shot: 43.75 + nq_open_1shot: 43.75 + winogrande: 87.5 + hellaswag: 93.75 + TheoremQA: 31.25 + college: 12.5 + college_knowledge: 87.5 + bbh-logical_deduction_seven_objects: 50 + bbh-multistep_arithmetic_two: 56.25 + mmlu-other: 76.92 + cmmlu-china-specific: 84.17 + mmlu_pro_math: 18.75 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml new file mode 100644 index 00000000..6df2b515 --- /dev/null +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -0,0 +1,459 @@ +baichuan2-7b-chat-hf: + gsm8k: 18.75 + race-high: 78.12 + +glm-4-9b-chat-hf: + gsm8k: 68.75 + race-high: 90.62 + +glm-4-9b-chat-turbomind: + gsm8k: 75.00 + race-high: 90.62 + +glm-4-9b-chat-vllm: + gsm8k: 65.62 + race-high: 90.62 + +deepseek-7b-chat-hf: + gsm8k: 46.88 + race-high: 81.25 + +deepseek-moe-16b-chat-hf: + gsm8k: 50 + race-high: 68.75 + +deepseek-7b-chat-vllm: + gsm8k: 43.75 + race-high: 75 + +gemma2-2b-it-hf: + gsm8k: 50 + race-high: 71.88 + +gemma2-9b-it-hf: + gsm8k: 71.88 + race-high: 84.38 + +gemma-2b-it-hf: + gsm8k: 3.12 + race-high: 40.62 + +gemma-7b-it-hf: + gsm8k: 40.62 + race-high: 68.75 + +gemma-2-9b-it-turbomind: + gsm8k: 68.75 + race-high: 81.25 + +gemma-7b-it-vllm: + gsm8k: 28.12 + race-high: 68.75 + +internlm2_5-7b-chat-hf: + gsm8k: 84.38 + race-high: 90.62 + +internlm2_5-7b-chat-turbomind: + gsm8k: 84.38 + race-high: 90.62 + +internlm2-chat-1.8b-turbomind: + gsm8k: 25 + race-high: 84.38 + +internlm2-chat-1.8b-sft-turbomind: + gsm8k: 21.88 + race-high: 84.38 + +internlm2-chat-7b-lmdeploy: + gsm8k: 53.12 + race-high: 84.38 + +internlm2-chat-7b-sft-turbomind: + gsm8k: 50 + race-high: 90.62 + +internlm2-chat-7b-vllm: + gsm8k: 43.75 + race-high: 87.5 + +llama-3_1-8b-instruct-hf: + gsm8k: 84.38 + race-high: 90.62 + +llama-3_2-3b-instruct-hf: + gsm8k: 65.62 + race-high: 81.25 + +llama-3-8b-instruct-hf: + gsm8k: 68.75 + race-high: 87.5 + +llama-3_1-8b-instruct-turbomind: + gsm8k: 78.12 + race-high: 90.62 + +llama-3_2-3b-instruct-turbomind: + gsm8k: 65.62 + race-high: 
81.25 + +llama-3-8b-instruct-turbomind: + gsm8k: 68.75 + race-high: 87.5 + +mistral-7b-instruct-v0.2-hf: + gsm8k: 40.62 + race-high: 75 + +mistral-7b-instruct-v0.3-hf: + gsm8k: 40.62 + race-high: 75 + +mistral-nemo-instruct-2407-hf: + gsm8k: 75 + race-high: 81.25 + +mistral-nemo-instruct-2407-turbomind: + gsm8k: 75 + race-high: 81.25 + +mistral-7b-instruct-v0.1-vllm: + gsm8k: 37.5 + race-high: 71.88 + +mistral-7b-instruct-v0.2-vllm: + gsm8k: 43.75 + race-high: 75 + +MiniCPM3-4B-hf: + gsm8k: 68.75 + race-high: 84.38 + +minicpm-2b-dpo-fp32-hf: + gsm8k: 56.25 + race-high: 56.25 + +minicpm-2b-sft-bf16-hf: + gsm8k: 46.88 + race-high: 65.62 + +minicpm-2b-sft-fp32-hf: + gsm8k: 46.88 + race-high: 65.62 + +phi-3-mini-4k-instruct-hf: + gsm8k: 56.25 + race-high: 78.12 + +qwen1.5-0.5b-chat-hf: + gsm8k: 0 + race-high: 53.12 + +qwen2-1.5b-instruct-hf: + gsm8k: 62.5 + race-high: 84.38 + +qwen2-7b-instruct-hf: + gsm8k: 68.75 + race-high: 90.62 + +qwen2-1.5b-instruct-turbomind: + gsm8k: 62.50 + race-high: 84.38 + +qwen2-7b-instruct-turbomind: + gsm8k: 81.25 + race-high: 87.5 + +qwen1.5-0.5b-chat-vllm: + gsm8k: 3.12 + race-high: 53.12 + +yi-1.5-6b-chat-hf: + gsm8k: 65.62 + race-high: 84.38 + +yi-1.5-9b-chat-hf: + gsm8k: 75 + race-high: 93.75 + +deepseek-v2-lite-chat-hf: + gsm8k: 43.75 + race-high: 71.88 + +internlm2_5-20b-chat-hf: + gsm8k: 84.38 + race-high: 87.5 + +internlm2_5-20b-chat-turbomind: + gsm8k: 84.38 + race-high: 87.5 + +mistral-small-instruct-2409-hf: + gsm8k: 81.25 + race-high: 90.62 + +mistral-small-instruct-2409-turbomind: + gsm8k: 78.12 + race-high: 90.62 + +qwen2.5-14b-instruct-hf: + gsm8k: 71.88 + race-high: 93.75 + +qwen2.5-14b-instruct-turbomind: + gsm8k: 71.88 + race-high: 93.75 + +glm-4-9b-hf: + gsm8k: 68.75 + GPQA_diamond: 31.25 + race-high: 93.75 + winogrande: 84.38 + +deepseek-moe-16b-base-hf: + gsm8k: 21.88 + GPQA_diamond: 0 + race-high: 21.88 + winogrande: 65.62 + +deepseek-7b-base-turbomind: + gsm8k: 21.88 + GPQA_diamond: 0 + race-high: 46.88 + winogrande: 84.38 + +deepseek-moe-16b-base-vllm: + gsm8k: 21.88 + GPQA_diamond: 0 + race-high: 25 + winogrande: 68.75 + +gemma2-2b-hf: + gsm8k: 31.25 + GPQA_diamond: 3.12 + race-high: 56.25 + winogrande: 71.88 + +gemma2-9b-hf: + gsm8k: 68.75 + GPQA_diamond: 0 + race-high: 81.25 + winogrande: 84.38 + +gemma-2b-hf: + gsm8k: 18.75 + GPQA_diamond: 3.12 + race-high: 25 + winogrande: 53.12 + +gemma-7b-hf: + gsm8k: 56.25 + GPQA_diamond: 6.25 + race-high: 65.62 + winogrande: 78.12 + +gemma-2b-vllm: + gsm8k: 18.75 + GPQA_diamond: 6.25 + race-high: + winogrande: + +gemma-7b-vllm: + gsm8k: 59.38 + GPQA_diamond: 6.25 + race-high: + winogrande: + +internlm2_5-7b-hf: + gsm8k: 37.5 + GPQA_diamond: 25 + race-high: 93.75 + winogrande: 71.88 + +internlm2-7b-hf: + gsm8k: 53.12 + GPQA_diamond: 18.75 + race-high: 62.5 + winogrande: 78.12 + +internlm2-base-7b-hf: + gsm8k: 3.12 + GPQA_diamond: 21.88 + race-high: 75 + winogrande: 65.62 + +internlm2-1.8b-turbomind: + gsm8k: 12.5 + GPQA_diamond: 12.5 + race-high: 71.88 + winogrande: 75 + +internlm2_5-7b-turbomind: + gsm8k: 68.75 + GPQA_diamond: 31.25 + race-high: 93.75 + winogrande: 84.38 + +internlm2-7b-turbomind: + gsm8k: 56.25 + GPQA_diamond: 21.88 + race-high: 75 + winogrande: 81.25 + +internlm2-base-7b-turbomind: + gsm8k: 40.62 + GPQA_diamond: 28.12 + race-high: 84.38 + winogrande: 71.88 + +llama-2-7b-hf: + gsm8k: 21.88 + GPQA_diamond: 21.88 + race-high: 40.62 + winogrande: 71.88 + +llama-3_1-8b-hf: + gsm8k: 78.12 + GPQA_diamond: 25 + race-high: 90.62 + winogrande: 62.5 + +llama-3-8b-hf: + gsm8k: 46.88 + 
GPQA_diamond: 6.25 + race-high: 65.62 + winogrande: 65.62 + +llama-3.1-8b-turbomind: + gsm8k: 56.25 + GPQA_diamond: 6.25 + race-high: 78.12 + winogrande: 78.12 + +llama-3-8b-turbomind: + gsm8k: 50 + GPQA_diamond: 9.38 + race-high: 65.62 + winogrande: 78.12 + +mistral-7b-v0.2-hf: + gsm8k: 31.25 + GPQA_diamond: 6.25 + race-high: 62.5 + winogrande: 59.38 + +mistral-7b-v0.3-hf: + gsm8k: 31.25 + GPQA_diamond: 6.25 + race-high: 62.5 + winogrande: 59.38 + +mistral-7b-v0.2-vllm: + gsm8k: 34.38 + GPQA_diamond: 6.25 + race-high: 62.5 + winogrande: 65.62 + +qwen2.5-7b-hf: + gsm8k: 81.25 + GPQA_diamond: 18.75 + race-high: 87.5 + winogrande: 71.88 + +qwen2.5-1.5b-turbomind: + gsm8k: 71.88 + GPQA_diamond: 15.62 + race-high: 78.12 + winogrande: 71.88 + +qwen2.5-7b-turbomind: + gsm8k: 71.88 + GPQA_diamond: 25 + race-high: 87.5 + winogrande: 71.88 + +qwen1.5-moe-a2.7b-hf: + gsm8k: 62.5 + GPQA_diamond: 18.75 + race-high: 84.38 + winogrande: 75 + +qwen2-0.5b-hf: + gsm8k: 25 + GPQA_diamond: 0 + race-high: 40.62 + winogrande: 62.5 + +qwen2-1.5b-hf: + gsm8k: 59.38 + GPQA_diamond: 9.38 + race-high: 81.25 + winogrande: 62.5 + +qwen2-7b-hf: + gsm8k: 68.75 + GPQA_diamond: 9.38 + race-high: 87.5 + winogrande: 68.75 + +qwen2-1.5b-turbomind: + gsm8k: 62.50 + GPQA_diamond: 6.25 + race-high: 81.25 + winogrande: 75 + +qwen2-7b-turbomind: + gsm8k: 68.75 + GPQA_diamond: 12.5 + race-high: 87.5 + winogrande: 71.88 + +qwen1.5-0.5b-vllm: + gsm8k: 9.38 + GPQA_diamond: 0 + race-high: 56.25 + winogrande: 62.5 + +yi-1.5-6b-hf: + gsm8k: 62.5 + GPQA_diamond: 3.12 + race-high: 87.5 + winogrande: 62.5 + +yi-1.5-9b-hf: + gsm8k: 75 + GPQA_diamond: 40.62 + race-high: 87.5 + winogrande: 59.38 + +deepseek-v2-lite-hf: + gsm8k: 28.12 + GPQA_diamond: 21.88 + race-high: 59.38 + winogrande: 75 + +internlm2-20b-hf: + gsm8k: 56.25 + GPQA_diamond: 15.62 + race-high: 68.75 + winogrande: 75 + +internlm2-base-20b-hf: + gsm8k: 12.5 + GPQA_diamond: 9.38 + race-high: 84.38 + winogrande: 65.62 + +internlm2-20b-turbomind: + gsm8k: 68.75 + GPQA_diamond: 15.62 + race-high: 68.75 + winogrande: 81.25 + +qwen2.5-14b-hf: + gsm8k: 75 + GPQA_diamond: 37.5 + race-high: 93.75 + winogrande: 84.38 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index a141c66a..d16c5b03 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -13,11 +13,31 @@ on: description: 'Set branch or tag or commit id. Default is "main"' type: string default: 'main' + build_lmdeploy: + required: false + description: 'whether to build lmdeploy' + type: boolean + default: false + repo_org_lmdeploy: + required: false + description: 'Tested repository organization name. Default is internlm/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref_lmdeploy: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' regression_func: required: true description: 'regression functions' type: string - default: "['chat','base','cmd']" + default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']" + cuda_env: + required: true + description: "regression conda env, eg. 
['dsw_cu11','dsw_cu12']" + type: string + default: "['dsw_cu12']" schedule: - cron: '56 16 * * *' @@ -31,7 +51,7 @@ env: HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub - DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets + COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache HF_DATASETS_OFFLINE: 1 HF_EVALUATE_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 @@ -39,6 +59,8 @@ env: LMDEPLOY_USE_MODELSCOPE: false HF_HUB_OFFLINE: 1 TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas + REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} jobs: build-pypi: @@ -64,16 +86,51 @@ jobs: retention-days: 1 name: my-artifact-${{ github.run_id }} - daily_run_test: + build-pypi-lmdeploy: + if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.1 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + prepare_env: if: ${{!cancelled()}} - needs: build-pypi + needs: ['build-pypi', 'build-pypi-lmdeploy'] strategy: fail-fast: false matrix: - cuda_env: [dsw_cu11, dsw_cu12] + cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} runs-on: ${{ matrix.cuda_env }} environment: 'prod' - timeout-minutes: 600 #10hours + timeout-minutes: 240 #4hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -84,89 +141,169 @@ jobs: uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }} + - name: Remove Conda Env + if: always() + run: | + . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + conda info --envs - name: Prepare - create conda env and install torch - cu11 if: ${{matrix.cuda_env == 'dsw_cu11'}} - run: | - . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 - conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip uninstall torch torchvision torchaudio -y - pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - conda info --envs - pip list + uses: nick-fields/retry@v3 + id: retry1 + with: + max_attempts: 3 + timeout_minutes: 40 + command: | + . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip uninstall torch torchvision torchaudio -y + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + conda info --envs + pip list - name: Prepare - create conda env and install torch - cu12 if: ${{matrix.cuda_env == 'dsw_cu12'}} + uses: nick-fields/retry@v3 + id: retry2 + with: + max_attempts: 3 + timeout_minutes: 40 + command: | + . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + conda info --envs + pip list + - name: Prepare - reinstall lmdeploy - cu12 + if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Prepare - reinstall lmdeploy - cu12 + if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}} run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} - pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} - pip uninstall torch torchvision torchaudio -y - pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - conda info --envs - pip list + pip install lmdeploy-*.whl --no-deps + + daily_run_test: + if: ${{!cancelled()}} + needs: prepare_env + strategy: + fail-fast: false + matrix: + cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}} + regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}} + runs-on: ${{ matrix.cuda_env }} + environment: 'prod' + timeout-minutes: 240 #4hours + steps: + - name: Clone repository + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Prepare - prepare data and hf model run: | - ln -s ${{env.DATEASET_CACHE_PATH}} data rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub - name: Run command testcase - if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'cmd') + if: matrix.regression_func == 'cmd' run: | . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run chat model test - if: github.event_name == 'schedule' || 
contains(fromJSON(github.event.inputs.regression_func), 'chat') + if: matrix.regression_func == 'chat_models' run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs - sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py - opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run base model test - if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'base') + if: matrix.regression_func == 'base_models' run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs - opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py - - name: Remove Conda Env - if: always() + - name: Run chat model test - fullbench + if: matrix.regression_func == 'chat_obj_fullbench' run: | - rm -rf regression_result_daily . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs + opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily + python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py + - name: Run chat model test - fullbench + if: matrix.regression_func == 'chat_sub_fullbench' + env: + COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset + run: | + . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + conda info --envs + opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily + python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py + - name: Run base model test - fullbench + if: matrix.regression_func == 'base_fullbench' + run: | + . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + conda info --envs + opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily + python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py + - name: Run model test - api + if: matrix.regression_func == 'api' + run: | + . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + conda info --envs + lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 120s + opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily + python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py + - name: Run model test - api kill + if: always() && matrix.regression_func == 'api' + run: | + kill -15 "$restful_pid" notify_to_feishu: if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} From aca8ec3c6ab5f2558005862ed0183bb4cfb6414d Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:14:27 +0800 Subject: [PATCH 02/17] [Hotfix] Hotfix (#1683) * fix pip version * fix pip version * fix lint * hotfix --- opencompass/models/turbomind_with_tf_above_v4_33.py | 2 +- opencompass/models/vllm.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 76bbf194..cf5b880b 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -123,7 +123,7 @@ class TurboMindModelwithChatTemplate(BaseModel): gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) gen_config.update(self.gen_config) - if do_sample or self.gen_config['do_sample']: + if do_sample or ('do_sample' in self.gen_config and self.gen_config['do_sample']): gen_config['top_k'] = 40 gen_config['temperature'] = temperature else: diff --git a/opencompass/models/vllm.py b/opencompass/models/vllm.py index fbfaf66e..7e166ca7 100644 --- 
a/opencompass/models/vllm.py +++ b/opencompass/models/vllm.py @@ -101,7 +101,11 @@ class VLLM(BaseModel): if not self.lora_path: outputs = self.model.generate(inputs, sampling_kwargs) else: - outputs = self.model.generate(inputs, sampling_kwargs, lora_request=LoRARequest("sql_adapter", 1, self.lora_path)) + outputs = self.model.generate(inputs, + sampling_kwargs, + lora_request=LoRARequest( + 'sql_adapter', 1, + self.lora_path)) prompt_list, output_strs = [], [] for output in outputs: From f8a1c1f487da7847cbcae2ddafcd4718a2f1b478 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:48:05 +0800 Subject: [PATCH 03/17] [CI] update (#1682) Co-authored-by: zhulin1 --- .github/workflows/pr-run-test.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 054b2544..77cecb4b 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -10,17 +10,6 @@ on: - 'tools/**' workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is open-compass/opencompass' - type: string - default: 'open-compass/opencompass' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' schedule: - cron: '56 22 * * *' @@ -46,9 +35,6 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v2 - with: - repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Prepare - Install opencompass run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate From 2fee63f537dfc80e0c5a3882dbb1c086825e7bc6 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 13 Nov 2024 15:47:29 +0800 Subject: [PATCH 04/17] [Update] Auto-download for followbench (#1685) --- opencompass/utils/datasets_info.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index e896f917..38808e99 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -326,7 +326,7 @@ DATASETS_MAPPING = { "ms_id": "", "hf_id": "", "local": "./data/mmmlu_lite", - } + }, } DATASETS_URL = { @@ -426,6 +426,10 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip", "md5": "8a302712e425e27e4292a9369df5b9d3", }, + "subjective/followbench": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip", + "md5": "da7a831817c969da15d1e78d4a245d8a", + }, "subjective/mtbench101": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip", "md5": "5d80257bc9929ebe5cfbf6d11184b04c", @@ -496,11 +500,11 @@ DATASETS_URL = { }, "/aime": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", - "md5": "fbe2d0577fc210962a549f8cea1a00c8" + "md5": "fbe2d0577fc210962a549f8cea1a00c8", }, "/cmo": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", - "md5": "fad52c81290506a8ca74f46b5400d8fc" + "md5": "fad52c81290506a8ca74f46b5400d8fc", }, "/nq-open": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", @@ -521,5 +525,5 @@ DATASETS_URL = { "/WikiBench": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", "md5": 
"6dac1d1a3133fe1effff185cbf71d928", - } + }, } From e92a5d4230b7d39427ec7f618f506f024d43005c Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 14 Nov 2024 15:32:43 +0800 Subject: [PATCH 05/17] [Feature] BABILong Dataset added (#1684) * update * update * update * update --- configs/eval_babilong.py | 65 +++ .../configs/datasets/babilong/README.md | 37 ++ .../datasets/babilong/babilong_0k_gen.py | 37 ++ .../datasets/babilong/babilong_128k_gen.py | 38 ++ .../datasets/babilong/babilong_16k_gen.py | 38 ++ .../datasets/babilong/babilong_1m_gen.py | 37 ++ .../datasets/babilong/babilong_256k_gen.py | 38 ++ .../datasets/babilong/babilong_2k_gen.py | 38 ++ .../datasets/babilong/babilong_32k_gen.py | 38 ++ .../datasets/babilong/babilong_4k_gen.py | 38 ++ .../configs/summarizers/groups/babilong.py | 37 ++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/babilong/__init__.py | 1 + opencompass/datasets/babilong/babilong.py | 106 ++++ .../datasets/babilong/babilong_utils.py | 293 ++++++++++ opencompass/datasets/babilong/prompts.py | 516 ++++++++++++++++++ opencompass/utils/datasets_info.py | 9 + requirements/runtime.txt | 1 + 18 files changed, 1368 insertions(+) create mode 100644 configs/eval_babilong.py create mode 100644 opencompass/configs/datasets/babilong/README.md create mode 100644 opencompass/configs/datasets/babilong/babilong_0k_gen.py create mode 100644 opencompass/configs/datasets/babilong/babilong_128k_gen.py create mode 100644 opencompass/configs/datasets/babilong/babilong_16k_gen.py create mode 100644 opencompass/configs/datasets/babilong/babilong_1m_gen.py create mode 100644 opencompass/configs/datasets/babilong/babilong_256k_gen.py create mode 100644 opencompass/configs/datasets/babilong/babilong_2k_gen.py create mode 100644 opencompass/configs/datasets/babilong/babilong_32k_gen.py create mode 100644 opencompass/configs/datasets/babilong/babilong_4k_gen.py create mode 100644 opencompass/configs/summarizers/groups/babilong.py create mode 100644 opencompass/datasets/babilong/__init__.py create mode 100644 opencompass/datasets/babilong/babilong.py create mode 100644 opencompass/datasets/babilong/babilong_utils.py create mode 100644 opencompass/datasets/babilong/prompts.py diff --git a/configs/eval_babilong.py b/configs/eval_babilong.py new file mode 100644 index 00000000..0a5b0ba8 --- /dev/null +++ b/configs/eval_babilong.py @@ -0,0 +1,65 @@ +from mmengine.config import read_base + +with read_base(): + # Models + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import ( + models as lmdeploy_internlm2_5_7b_chat_model, + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import ( + models as lmdeploy_qwen2_5_7b_instruct_model, + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import ( + models as lmdeploy_llama3_1_8b_instruct_model, + ) + from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import ( + models as lmdeploy_ministral_8b_instruct_2410_model, + ) + + # Datasets + from opencompass.configs.datasets.babilong.babilong_0k_gen import ( + babiLong_0k_datasets, + ) + from opencompass.configs.datasets.babilong.babilong_4k_gen import ( + babiLong_4k_datasets, + ) + from opencompass.configs.datasets.babilong.babilong_16k_gen import ( + babiLong_16k_datasets, + ) + from opencompass.configs.datasets.babilong.babilong_32k_gen import ( + babiLong_32k_datasets, + ) + from opencompass.configs.datasets.babilong.babilong_128k_gen import ( + babiLong_128k_datasets, + ) + from 
opencompass.configs.datasets.babilong.babilong_256k_gen import ( + babiLong_256k_datasets, + ) + from opencompass.configs.summarizers.groups.babilong import ( + babilong_summary_groups, + ) + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +for model in models: + model['engine_config']['session_len'] = 1024 * 1024 + model['max_seq_len'] = 1024 * 1024 + model['engine_config']['tp'] = 4 + model['run_cfg']['num_gpus'] = 4 + + +summarizer = dict( + dataset_abbrs=[ + 'babilong_0k', + 'babilong_4k', + 'babilong_16k', + 'babilong_32k', + 'babilong_128k', + 'babilong_256k', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], [] + ), +) + +work_dir = './outputs/babilong' diff --git a/opencompass/configs/datasets/babilong/README.md b/opencompass/configs/datasets/babilong/README.md new file mode 100644 index 00000000..091a9319 --- /dev/null +++ b/opencompass/configs/datasets/babilong/README.md @@ -0,0 +1,37 @@ +# BABILong +OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). BABILong provides an evaluation of long-context reasoning across extremely long documents, including a diverse set of 20 reasoning tasks such as fact chaining, simple induction, deduction, counting, and handling lists/sets. This benchmark is designed to test the ability of language models to reason over facts distributed in long natural text, and it allows for the construction of tasks of almost arbitrary length to adapt to the evaluation of new, more powerful models in an extensible and controllable way. + + + +## How to Use +The BABILong dataset is available on Hugging Face: [RMT-team/babilong](https://huggingface.co/datasets/RMT-team/babilong). Opencompass provides an automatic download for BABILong dataset, due to the dataset size, we only provide the data up to 1M tokens. For longer context, you can download the dataset from Hugging Face directly. + +BABILong paper provides in total 20 tasks, we provide 10 tasks configurations in OpenCompass and they are organized by different context sizes. You can create your own configurations by following the examples in `opencompass/configs/datasets/babilong/babilong_1m_gen.py`. + +Opencompass provides a demo for evaluating language models on the BABILong dataset. + +```bash +opencompass configs/eval_babilong.py +``` +OpenCompass provides the results of some models on the BABILong dataset. The evaluation results are run with LMDeploy with default model settings. 
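For reference, the babilong_*_gen.py configs added below all share one structure and differ only in `split_name` and `max_seq_len`. A custom split (for example 8k, which `BabiLongDataset.load` also accepts) can be assembled the same way. The sketch below simply mirrors those files; it is illustrative only, and the variable names and task subset are assumptions, not code shipped by this PR.

```python
# Hypothetical config sketch (not added by this PR): BABILong at an 8k context,
# mirroring the babilong_*_gen.py files introduced below.
from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

babiLong_8k_datasets = []
split_name = '8k'            # any split name accepted by BabiLongDataset.load
max_seq_len = 8 * 1024       # inference window matched to the context size
tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5']  # illustrative subset of the ten tasks

for task in tasks:
    babiLong_8k_datasets.append(dict(
        abbr=f'babilong_{task}_{split_name}',
        type=BabiLongDataset,
        path='opencompass/babilong',
        task=task,
        split_name=split_name,
        reader_cfg=dict(input_columns=['prompt'], output_column='answer'),
        infer_cfg=dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt='{prompt}'),
                    dict(role='BOT', prompt='{answer}\n'),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len),
        ),
        eval_cfg=dict(evaluator=dict(type=BabiLongEvaluator)),
    ))
```

The table that follows reports the scores obtained with those default settings.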
+ +| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | qwen2.5-7b-instruct-turbomind | llama-3_1-8b-instruct-turbomind | ministral-8B-instruct-2410-turbomind | +|----- | ----- | ----- | ----- | ----- | ----- | ----- | -----| +| babilong_0k | - | naive_average | gen | 76.51 | 80.25 | 76.44 | 76.40 | +| babilong_4k | - | naive_average | gen | 67.55 | 70.35 | 67.41 | 67.92 | +| babilong_16k | - | naive_average | gen | 53.78 | 65.83 | 60.26 | 56.58 | +| babilong_32k | - | naive_average | gen | 50.86 | 62.66 | 59.56 | 53.52 | +| babilong_128k | - | naive_average | gen | 39.33 | 27.79 | 52.01 | 3.20 | +| babilong_256k | - | naive_average | gen | 17.31 | 7.30 | 23.35 | 9.50 | + +## Citation + +```bibtex +@misc{kuratov2024babilong, + title={BABILong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack}, + author={Yuri Kuratov and Aydar Bulatov and Petr Anokhin and Ivan Rodkin and Dmitry Sorokin and Artyom Sorokin and Mikhail Burtsev}, + year={2024}, + eprint={2406.10149}, + archivePrefix={arXiv} +} +``` \ No newline at end of file diff --git a/opencompass/configs/datasets/babilong/babilong_0k_gen.py b/opencompass/configs/datasets/babilong/babilong_0k_gen.py new file mode 100644 index 00000000..8d10a66c --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_0k_gen.py @@ -0,0 +1,37 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_0k_datasets = [] +split_name='0k' +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_0k_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/babilong/babilong_128k_gen.py b/opencompass/configs/datasets/babilong/babilong_128k_gen.py new file mode 100644 index 00000000..c0a24f4e --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_128k_gen.py @@ -0,0 +1,38 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_128k_datasets = [] +split_name='128k' +max_seq_len = 128*1024 +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + 
), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_128k_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/babilong/babilong_16k_gen.py b/opencompass/configs/datasets/babilong/babilong_16k_gen.py new file mode 100644 index 00000000..f192fece --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_16k_gen.py @@ -0,0 +1,38 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_16k_datasets = [] +split_name='16k' +max_seq_len = 16*1024 +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_16k_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/babilong/babilong_1m_gen.py b/opencompass/configs/datasets/babilong/babilong_1m_gen.py new file mode 100644 index 00000000..33b6510a --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_1m_gen.py @@ -0,0 +1,37 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_1m_datasets = [] +split_name='1m' +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_1m_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/babilong/babilong_256k_gen.py b/opencompass/configs/datasets/babilong/babilong_256k_gen.py new file mode 100644 index 00000000..8e0e591e --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_256k_gen.py @@ -0,0 +1,38 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_256k_datasets = [] +split_name='256k' +max_seq_len = 
256*1024 +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len ), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_256k_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/babilong/babilong_2k_gen.py b/opencompass/configs/datasets/babilong/babilong_2k_gen.py new file mode 100644 index 00000000..39a7cb82 --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_2k_gen.py @@ -0,0 +1,38 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_2k_datasets = [] +split_name='2k' +max_seq_len = 2*1024 +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_2k_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/babilong/babilong_32k_gen.py b/opencompass/configs/datasets/babilong/babilong_32k_gen.py new file mode 100644 index 00000000..d3ac20c7 --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_32k_gen.py @@ -0,0 +1,38 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_32k_datasets = [] +split_name='32k' +max_seq_len = 32*1024 +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_32k_datasets.append(tmp_dataset) diff --git 
a/opencompass/configs/datasets/babilong/babilong_4k_gen.py b/opencompass/configs/datasets/babilong/babilong_4k_gen.py new file mode 100644 index 00000000..b8c23e4d --- /dev/null +++ b/opencompass/configs/datasets/babilong/babilong_4k_gen.py @@ -0,0 +1,38 @@ +from opencompass.datasets.babilong.babilong import BabiLongDataset, BabiLongEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + + +babiLong_4k_datasets = [] +split_name='4k' +max_seq_len=4*1024 +tasks = ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10'] + + +for task in tasks: + tmp_dataset = { + 'abbr': f'babilong_{task}_{split_name}', + 'type': BabiLongDataset, + 'path': 'opencompass/babilong', + 'task': task, + 'split_name': split_name, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=max_seq_len), + ), + 'eval_cfg': dict( + evaluator=dict(type=BabiLongEvaluator), + ), + } + babiLong_4k_datasets.append(tmp_dataset) diff --git a/opencompass/configs/summarizers/groups/babilong.py b/opencompass/configs/summarizers/groups/babilong.py new file mode 100644 index 00000000..9b1b39ee --- /dev/null +++ b/opencompass/configs/summarizers/groups/babilong.py @@ -0,0 +1,37 @@ +default_babilong_tasks = [ + 'qa1', + 'qa2', + 'qa3', + 'qa4', + 'qa5', + 'qa6', + 'qa7', + 'qa8', + 'qa9', + 'qa10', +] +context_window_sizes = [ + '0k', + '1k', + '2k', + '4k', + '8k', + '16k', + '32k', + '64k', + '128k', + '256k', + '512k', + '1m', +] +babilong_summary_groups = [] +for context_window_size in context_window_sizes: + babilong_summary_groups.append( + { + 'name': f'babilong_{context_window_size}', + 'subsets': [ + f'babilong_{task}_{context_window_size}' + for task in default_babilong_tasks + ], + } + ) diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 4b64d77b..4ab4a7d0 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -7,6 +7,7 @@ from .anthropics_evals import * # noqa: F401, F403 from .apps import * # noqa: F401, F403 from .arc import * # noqa: F401, F403 from .ax import * # noqa: F401, F403 +from .babilong import * # noqa: F401, F403 from .bbh import * # noqa: F401, F403 from .boolq import * # noqa: F401, F403 from .bustum import * # noqa: F401, F403 diff --git a/opencompass/datasets/babilong/__init__.py b/opencompass/datasets/babilong/__init__.py new file mode 100644 index 00000000..e7c55705 --- /dev/null +++ b/opencompass/datasets/babilong/__init__.py @@ -0,0 +1 @@ +from .babilong import * # noqa: F401, F403 diff --git a/opencompass/datasets/babilong/babilong.py b/opencompass/datasets/babilong/babilong.py new file mode 100644 index 00000000..2529c761 --- /dev/null +++ b/opencompass/datasets/babilong/babilong.py @@ -0,0 +1,106 @@ +# flake8: noqa: F401, E501 +import json +import os + +from datasets import Dataset + +from opencompass.datasets.babilong.babilong_utils import compare_answers +from opencompass.datasets.babilong.prompts import (DEFAULT_PROMPTS, + DEFAULT_TEMPLATE, + get_formatted_input) +from opencompass.datasets.base import BaseDataset +from opencompass.openicl import BaseEvaluator +from 
opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + + +@LOAD_DATASET.register_module() +class BabiLongDataset(BaseDataset): + + @staticmethod + def load( + path, + task, + split_name, + use_instruction=True, + use_examples=True, + use_post_prompt=True, + ) -> Dataset: + + assert task in [ + 'qa1', + 'qa2', + 'qa3', + 'qa4', + 'qa5', + 'qa6', + 'qa7', + 'qa8', + 'qa9', + 'qa10', + ], f"Task must be in ['qa1', 'qa2', 'qa3', 'qa4', 'qa5', 'qa6', 'qa7', 'qa8', 'qa9', 'qa10']" + assert split_name in [ + '0k', + '1k', + '2k', + '4k', + '8k', + '16k', + '32k', + '64k', + '128k', + '256k', + '512k', + '1m', + ], f"Split name must be in ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '256k', '512k', '1m']" + + # configure the prompt + prompt_cfg = { + 'instruction': + (DEFAULT_PROMPTS[task]['instruction'] if use_instruction else ''), + 'examples': + (DEFAULT_PROMPTS[task]['examples'] if use_examples else ''), + 'post_prompt': + (DEFAULT_PROMPTS[task]['post_prompt'] if use_post_prompt else ''), + 'template': + DEFAULT_TEMPLATE, + } + + path = get_data_path(path) + file = os.path.join(path, task, f'{split_name}.json') + + with open(file, 'r') as f: + task_data = json.load(f) + + data = [] + for sample in task_data: + tmp_data = {'prompt': [], 'answer': []} + target = sample['target'] + context = sample['input'] + question = sample['question'] + + input_text = get_formatted_input( + context, + question, + prompt_cfg['examples'], + prompt_cfg['instruction'], + prompt_cfg['post_prompt'], + template=DEFAULT_TEMPLATE, + ) + + tmp_data['prompt'].append(input_text) + tmp_data['answer'].append(target) + data.append(tmp_data) + return Dataset.from_list(data) + + +class BabiLongEvaluator(BaseEvaluator): + + def score(self, predictions, gold): + assert len(predictions) == len(gold) + score = (sum([ + compare_answers(str(ref[0]), pred) + for pred, ref in zip(predictions, gold) + ]) / len(predictions) * 100) + result = {'score': round(score, 2)} + return result diff --git a/opencompass/datasets/babilong/babilong_utils.py b/opencompass/datasets/babilong/babilong_utils.py new file mode 100644 index 00000000..8089072d --- /dev/null +++ b/opencompass/datasets/babilong/babilong_utils.py @@ -0,0 +1,293 @@ +# flake8: noqa: E501 +# Modifided from https://github.com/booydar/babilong/blob/main/babilong/babilong_utils.py +import re + +import nltk +import numpy as np +import pandas as pd +from torch.utils.data import Dataset + + +def compare_answers(target, output): + """Compare target and output answers. + + Takes only the first sentence from output and filters responses when model + tries to generate examples. We consider prediction correct if target is in + output. 
+ """ + target = target.lower() + output = output.lower() + # take only the first sentence from output + output = output.split('.')[0] + # filter responses when model tries to generate examples + output = output.split('')[0] + output = output.split('')[0] + + # we consider prediction correct if target is in output + if target in output: + return True + + return False + + +def get_dataset_df(dataset_path, max_n_facts=None): + """Preprocess babi text files.""" + with open(dataset_path, 'r') as f: + texts = f.read().strip() + texts = texts.split('\n') + df = pd.DataFrame(texts, columns=['text']) + + # parse samples + df['phrase_num'] = df.text.apply(lambda x: int(x.split(' ')[0])) + df.text = df.text.apply(lambda x: x[x.index(' ') + 1:]) + df['answer'] = df.text.apply(lambda x: x[x.index('\t') + 1:] + if '\t' in x else None) + df['reference_num'] = df.answer.apply( + lambda x: x + if x is None else [int(n) for n in re.split('\t| ', x)[1:]]) + df.answer = df.answer.apply(lambda x: x if x is None else x.split('\t')[0]) + df.text = df.text.apply(lambda x: x.split('\t')[0] if '\t' in x else x) + + # mark each sample + sample_start_inds = list(np.where(df.phrase_num == 1)[0]) + [df.shape[0]] + for i, (start, + end) in enumerate(zip(sample_start_inds, sample_start_inds[1:])): + df.loc[start:end, 'initial_sample_num'] = i + + df.initial_sample_num = df.initial_sample_num.astype(int) + + # multiple questions in sample -> samples with single question + initial_samples = [ + df[df.initial_sample_num == sn] + for sn in df.initial_sample_num.unique() + ] + + single_question_slices = [] + for sample in initial_samples: + answer_positions = sample[~sample.answer.isna()].index + slices = [sample.loc[:ans_pos].copy() for ans_pos in answer_positions] + for i, slc in enumerate(slices): + slices[i] = slc[(slc.answer.isna()) | (slc.index == slc.index[-1])] + if max_n_facts is not None: # drop samples with too many facts + slices = [slc for slc in slices if slc.shape[0] <= max_n_facts] + single_question_slices += slices + + df = pd.concat(single_question_slices).reset_index(drop=True) + + # mark each sample again + sample_start_inds = list(np.where(df.phrase_num == 1)[0]) + [df.shape[0]] + for i, (start, + end) in enumerate(zip(sample_start_inds, sample_start_inds[1:])): + df.loc[start:end, 'sample_num'] = i + + df.sample_num = df.sample_num.astype(int) + + return df + + +class TaskDataset(Dataset): + """Babi task loader dataset.""" + + def __init__(self, dataset_path, max_n_facts=None): + self.fact_dataset = get_dataset_df(dataset_path, + max_n_facts=max_n_facts) + + def __getitem__(self, ind): + slc = self.fact_dataset[self.fact_dataset.sample_num == ind] + references = slc[slc.phrase_num.isin( + slc.reference_num.values[-1])].text.values + sample = { + 'facts': slc.text.values[:-1], + 'question': slc.text.values[-1], + 'answer': slc.answer.values[-1], + 'references': references, + } + return sample + + def __len__(self): + return self.fact_dataset.sample_num.max() + + +def sum_lengths(sentences): + return sum([len(s) for s in sentences]) + + +class SentenceSampler: + """Sampler of background text.""" + + def __init__( + self, + dataset, + tokenizer, + min_sentence_len=10, + max_sentence_len=None, + shuffle=False, + random_seed=42, + ): + self.sample_ind = 0 + self.dataset = dataset + self.sentences = [] + self.tokenizer = tokenizer + self.min_sentence_len = min_sentence_len + self.max_sentence_len = max_sentence_len + self.sentence_tokenizer = nltk.PunktSentenceTokenizer() + self.shuffle = shuffle + self.gen = 
np.random.default_rng(seed=random_seed) + + def get_sample(self, sample_size): + sample = [] + total_len = 0 + while True: + sentences = list(self.sentences) + for i, sent in enumerate( + sentences + ): # add new sentence until sample_size is reached + tokenized = self.tokenizer.encode(sent, + add_special_tokens=False) + if not self.length_is_ok(tokenized): + continue + total_len += len(tokenized) + sample.append(tokenized) + if total_len >= sample_size: + self.sentences = self.sentences[i + 1:] + cutoff = total_len - sample_size + if cutoff > 0: + sample[-1] = sample[-1][:-cutoff] + return sample + + self.sentences = [] + self.sample_sentences_( + sample_size + ) # appends new sentences, can be updated to just return new sentences + + def sample_sentences_(self, sample_size): + sentences = [] + while len(sentences) == 0: + text = self.next_sample_() + if self.shuffle: + if len(text) == 0: + continue + text = text[self.gen.choice(len( + text)):] # start from random position in text + text = text[:sample_size * + 10] # cut too long texts to speed up tokenization + sentences += self.sentence_tokenizer.tokenize(text) + if self.shuffle: + sentences = sentences[1:-1] + self.sentences += sentences + + def next_sample_(self): + if self.shuffle: + self.total_tokens = 0 + sample_ind = self.gen.choice(len(self.dataset)) + sample = self.dataset[int(sample_ind)]['text'] + else: + sample = self.dataset[int(self.sample_ind)]['text'] + self.sample_ind += 1 + self.sample_ind = self.sample_ind % len(self.dataset) + return sample + + def length_is_ok(self, tokenized): + if (self.max_sentence_len is not None + and len(tokenized) > self.max_sentence_len): + return False + if (self.min_sentence_len is not None + and len(tokenized) < self.min_sentence_len): + return False + return True + + +class NoiseInjectionDataset(Dataset): + """Combined dataset for noisy babi QA. + + It's recommended to use sample_size >= 1024 and task_end_pct - task_start_pct >= 0.2 + """ + + def __init__( + self, + task_dataset, + noise_sampler, + tokenizer, + task_start_pct=None, # left border of facts in sample, between 0 and 1 + task_end_pct=None, # right border of facts in sample, between task_start_pct and 1 + sample_size=1024, + mixed_length_ratio=0.0, # used for mixed length curriculum, prob for shorter samples + random_seed=42, + ): + self.task_dataset = task_dataset + self.noise_sampler = noise_sampler + self.sample_size = sample_size + self.mixed_length_ratio = mixed_length_ratio + self.tokenizer = tokenizer + self.task_start_pct = task_start_pct + self.task_end_pct = task_end_pct + if random_seed: + self.gen = np.random.default_rng(seed=random_seed) + + def __getitem__(self, ind): + sample = self.task_dataset[ind] + facts_tok = self.tokenizer(list(sample['facts']))['input_ids'] + question_tok = self.tokenizer(sample['question'])['input_ids'] + answer_tok = self.tokenizer(sample['answer'])['input_ids'] + + sample_size = self.get_sample_size() + task_len = sum_lengths(facts_tok) + background_text_len = sample_size - task_len + background_text = self.noise_sampler.get_sample(background_text_len) + sample['background_text'] = background_text + + if (self.task_start_pct is None + and self.task_end_pct is None): # if fact position unspecified + possible_positions = range(len(background_text) + 1) + else: + task_start_ind = int(sample_size * self.task_start_pct) + task_end_ind = int(sample_size * self.task_end_pct) + total_facts_len = sum_lengths(facts_tok) + + possible_positions = [] # where can we insert facts? 
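+            # Note: the loop below walks through the background sentences while
+            # accumulating the running token offset; a sentence index is kept as
+            # a valid insertion point only when that offset falls inside the
+            # requested [task_start_ind, task_end_ind - total_facts_len) window,
+            # so the inserted facts stay within the configured span of the sample.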
+ current_length = 0 + for i, text in enumerate(background_text): + if (current_length >= task_start_ind) and ( + current_length < task_end_ind - total_facts_len): + possible_positions.append(i) + current_length += len(text) + + if len(possible_positions) == 0: + raise IndexError( + f'Unable to insert facts in specified place: {self.task_start_pct, self.task_end_pct}.' + f'Total fact length: {total_facts_len}, ' + f'sentences length: {[len(t) for t in background_text]}. ' + f'Make the range wider or increase the sample size.') + + fact_positions = self.gen.choice(possible_positions, len(facts_tok)) + fact_positions.sort() + sample['fact_positions'] = ( + fact_positions # positions of facts between noise sentences + ) + + updated_sample = [[] for _ in range(len(background_text) + 1)] + for fact, pos in zip(facts_tok, fact_positions): + updated_sample[pos].append(fact) + + for i, s in enumerate(background_text): + updated_sample[i].append(s) + + flat = [i for s in updated_sample for i in s] + tokens = [i for s in flat for i in s] + + sample['input_tokens'] = tokens + sample['question_tokens'] = question_tok + sample['target_tokens'] = answer_tok + + return sample + + def __len__(self): + return len(self.task_dataset) + + def get_sample_size(self): + if isinstance(self.sample_size, list): + if self.gen.random() > self.mixed_length_ratio: + return self.gen.choice(self.sample_size) + return max(self.sample_size) + else: + return self.sample_size diff --git a/opencompass/datasets/babilong/prompts.py b/opencompass/datasets/babilong/prompts.py new file mode 100644 index 00000000..b402e7ce --- /dev/null +++ b/opencompass/datasets/babilong/prompts.py @@ -0,0 +1,516 @@ +# flake8: noqa: E501 +SYSTEM_TEMPLATE = '{instruction}\n\n{examples}\n\n{post_prompt}' +USER_TEMPLATE = '\n{context}\n\n\nQuestion: {question}' +DEFAULT_TEMPLATE = f'{SYSTEM_TEMPLATE}\n\n{USER_TEMPLATE}' + +CUSTOM_SYSTEM_PROMPTS = { + # https://github.com/dvlab-research/LongLoRA/blob/2345c6d030f61ac3a031906386a103a5b05e0e6f/inference.py#L18 + 'LONGLORA_LLAMA2': + 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. ' + 'Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. ' + 'Please ensure that your responses are socially unbiased and positive in nature.\n\n' + 'If a question does not make any sense, or is not factually coherent, explain why instead of answering ' + 'something not correct. If you don\'t know the answer to a question, please don\'t share false information.' +} + + +def get_formatted_input( + context, + question, + examples, + instruction, + post_prompt, + template=DEFAULT_TEMPLATE, +): + # pre_prompt - general instruction + # examples - in-context examples + # post_prompt - any additional instructions after examples + # context - text to use for qa + # question - question to answer based on context + formatted_input = template.format( + instruction=instruction, + examples=examples, + post_prompt=post_prompt, + context=context.strip(), + question=question, + ) + return formatted_input.strip() + + +DEFAULT_PROMPTS = { + 'qa1': { + 'instruction': + 'I will give you context with the facts about positions of different persons hidden in some random text ' + 'and a question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location to answer the question.', + 'examples': + '\n' + 'Charlie went to the hallway. 
Judith come back to the kitchen. Charlie travelled to balcony. ' + 'Where is Charlie?\n' + 'Answer: The most recent location of Charlie is balcony.\n' + '\n\n' + '\n' + 'Alan moved to the garage. Charlie went to the beach. Alan went to the shop. Rouse ' + 'travelled to balcony. Where is Alan?\n' + 'Answer: The most recent location of Alan is shop.\n' + '', + 'post_prompt': + 'Always return your answer in the following format: ' + 'The most recent location of ’person’ is ’location’. Do not write anything else after that.', + }, + 'qa2': { + 'instruction': + 'I give you context with the facts about locations and actions of different persons ' + 'hidden in some random text and a question.' + 'You need to answer the question based only on the information from the facts.\n' + 'If a person got an item in the first location and travelled to the second location ' + 'the item is also in the second location. ' + 'If a person dropped an item in the first location and moved to the second location ' + 'the item remains in the first location.', + 'examples': + '\n' + 'Charlie went to the kitchen. Charlie got a bottle. Charlie moved to the balcony. ' + 'Where is the bottle?\n' + 'Answer: The bottle is in the balcony.\n' + '\n' + '\n' + 'Alan moved to the garage. Alan got a screw driver. Alan moved to the kitchen. Where ' + 'is the screw driver?\n' + 'Answer: The screw driver is in the kitchen.\n' + '', + 'post_prompt': + 'Always return your answer in the following format: The ’item’ is in ’location’. ' + 'Do not write anything else after that.', + }, + 'qa3': { + 'instruction': + 'I give you context with the facts about locations and actions of different persons ' + 'hidden in some random text and a question. ' + 'You need to answer the question based only on the information from the facts.\n' + 'If a person got an item in the first location and travelled to the second location ' + 'the item is also in the second location. ' + 'If a person dropped an item in the first location and moved to the second location ' + 'the item remains in the first location.', + 'examples': + '\n' + 'John journeyed to the bedroom. Mary grabbed the apple. Mary went back to the bathroom. ' + 'Daniel journeyed to the bedroom. Daniel moved to the garden. Mary travelled to the kitchen. ' + 'Where was the apple before the kitchen?\n' + 'Answer: Before the kitchen the apple was in the bathroom.\n' + '\n' + '\n' + 'John went back to the bedroom. John went back to the garden. John went back to the kitchen. ' + 'Sandra took the football. Sandra travelled to the garden. Sandra journeyed to the bedroom. ' + 'Where was the football before the bedroom?\n' + 'Answer: Before the bedroom the football was in the garden.\n' + '', + 'post_prompt': + 'Always return your answer in the following format: ' + 'Before the $location_1$ the $item$ was in the $location_2$. Do not write anything else after that.', + }, + 'qa4': { + 'instruction': + 'I will give you context with the facts about different people, their location and actions, hidden in ' + 'some random text and a question. ' + 'You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'The hallway is south of the kitchen. The bedroom is north of the kitchen. ' + 'What is the kitchen south of?\n' + 'Answer: bedroom\n' + '\n' + '\n' + 'The garden is west of the bedroom. The bedroom is west of the kitchen. What is west of the bedroom?\n' + 'Answer: garden\n' + '', + 'post_prompt': + 'Your answer should contain only one word - location. 
Do not write anything else after that.', + }, + 'qa5': { + 'instruction': + 'I will give you context with the facts about locations and their relations hidden in some random text ' + 'and a question. You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'Mary picked up the apple there. Mary gave the apple to Fred. Mary moved to the bedroom. ' + 'Bill took the milk there. Who did Mary give the apple to?\n' + 'Answer: Fred\n' + '\n' + '\n' + 'Jeff took the football there. Jeff passed the football to Fred. Jeff got the milk there. ' + 'Bill travelled to the bedroom. Who gave the football?\n' + 'Answer: Jeff\n' + '\n' + '\n' + 'Fred picked up the apple there. Fred handed the apple to Bill. Bill journeyed to the bedroom. ' + 'Jeff went back to the garden. What did Fred give to Bill?\n' + 'Answer: apple\n' + '', + 'post_prompt': + 'Your answer should contain only one word. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa6': { + 'instruction': + 'I will give you context with the facts about people and their locations hidden in some random text and a ' + 'question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'John travelled to the hallway. John travelled to the garden. Is John in the garden?\n' + 'Answer: yes\n' + '\n' + '\n' + 'Mary went to the office. Daniel journeyed to the hallway. Mary went to the bedroom. ' + 'Sandra went to the garden. Is Mary in the office?\n' + 'Answer: no\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - $yes$ or $no$. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa7': { + 'instruction': + 'I will give you context with the facts about people and objects they carry, hidden in some random text ' + 'and a question. You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'Daniel went to the bedroom. Daniel got the apple there. How many objects is Daniel carrying?\n' + 'Answer: one\n' + '\n' + '\n' + 'Mary grabbed the apple there. Mary gave the apple to John. How many objects is Mary carrying?\n' + 'Answer: none\n' + '\n' + '\n' + 'Sandra travelled to the hallway. Sandra picked up the milk there. Sandra took the apple there. ' + 'Mary travelled to the garden. How many objects is Sandra carrying?\n' + 'Answer: two\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - $none$ or $number_of_objects$. ' + 'Do not write anything else after that. Do not explain your answer.', + }, + 'qa8': { + 'instruction': + 'I will give you context with the facts about people and objects they carry, hidden in some random text ' + 'and a question. You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'Sandra travelled to the garden. Mary grabbed the milk there. What is Mary carrying?\n' + 'Answer: milk\n' + '\n' + '\n' + 'Mary travelled to the kitchen. Sandra travelled to the office. John travelled to the office. ' + 'Sandra discarded the milk there. What is Sandra carrying?\n' + 'Answer: nothing\n' + '\n' + '\n' + 'Daniel grabbed the apple there. Mary went to the office. Daniel moved to the garden. ' + 'Daniel grabbed the milk there. Mary went to the kitchen. 
What is Daniel carrying?\n' + 'Answer: apple,milk\n' + '\n', + 'post_prompt': + 'Your answer should contain only one or two words: $nothing$ or $object$ or $object_1$, $object_2$. ' + 'Do not write anything else. Do not explain your answer.', + }, + 'qa9': { + 'instruction': + 'I will give you context with the facts about people and their locations hidden in some random text and ' + 'a question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'John is not in the bathroom. Sandra is not in the bedroom. Is John in the bathroom?\n' + 'Answer: no\n' + '\n' + '\n' + 'Mary journeyed to the kitchen. John is in the bedroom. Sandra is not in the garden. ' + 'Is Mary in the kitchen?\n' + 'Answer: yes\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. ' + 'Do not explain your answer.', + }, + 'qa10': { + 'instruction': + 'I will give you context with the facts about people and their locations hidden in some random text and a ' + 'question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'Bill is in the kitchen. Julie is either in the school or the cinema. Is Bill in the bedroom?\n' + 'Answer: no\n' + '\n' + '\n' + 'Fred is in the bedroom. Mary is either in the school or the cinema. Is Mary in the school?\n' + 'Answer: maybe\n' + '\n' + '\n' + 'Fred is either in the kitchen or the park. Bill moved to the cinema. Is Bill in the cinema?\n' + 'Answer: yes\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - $yes$ or $no$ or $maybe$. Do not write anything else. ' + 'Do not explain your answer.', + }, + 'qa11': { + 'instruction': + 'I will give you context with the facts about people and their locations hidden in some random text and a ' + 'question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'Daniel journeyed to the hallway. After that he journeyed to the garden. Where is Daniel?\n' + 'Answer: garden\n' + '\n' + '\n' + 'Mary moved to the office. Afterwards she journeyed to the kitchen. Daniel went to the hallway. ' + 'Then he journeyed to the garden. Where is Mary?\n' + 'Answer: kitchen\n' + '\n' + '\n' + 'Sandra moved to the kitchen. After that she went back to the hallway. Sandra moved to the bedroom. ' + 'Then she went to the hallway. Mary moved to the bedroom. Afterwards she travelled to the bathroom. ' + 'Where is Sandra?\n' + 'Answer: hallway\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - location. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa12': { + 'instruction': + 'I will give you context with the facts about people and their locations hidden in some random text and a ' + 'question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'Mary and Daniel travelled to the bathroom. John and Daniel travelled to the office. 
Where is Daniel?\n' + 'Answer: office\n' + '\n' + '\n' + 'Sandra and Mary went back to the office. Daniel and Sandra went to the bedroom. Sandra and Mary travelled to the hallway. ' + 'John and Mary went to the kitchen. Where is Mary?\n' + 'Answer: kitchen\n' + '\n' + '\n' + 'Daniel and Sandra went back to the hallway. Daniel and John moved to the office. Daniel and John moved to the garden. ' + 'Daniel and Mary went back to the bathroom. Daniel and John went back to the kitchen. Daniel and Sandra went to the bathroom. ' + 'Where is John?\n' + 'Answer: kitchen\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - location. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa13': { + 'instruction': + 'I will give you context with the facts about people and their locations hidden in some random text and a ' + 'question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'Mary and Daniel travelled to the bathroom. Then they journeyed to the hallway. Where is Daniel?\n' + 'Answer: hallway\n' + '\n' + '\n' + 'Daniel and Sandra travelled to the kitchen. After that they journeyed to the hallway. Mary and Daniel travelled to the bedroom. ' + 'After that they travelled to the hallway. Where is Sandra?\n' + 'Answer: hallway\n' + '\n' + '\n' + 'John and Mary moved to the bathroom. Then they travelled to the office. John and Mary went to the kitchen. ' + 'Afterwards they went to the bedroom. John and Sandra moved to the bathroom. Following that they went back to the kitchen. ' + 'Where is Mary?\n' + 'Answer: bedroom\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - location. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa14': { + 'instruction': + 'I will give you context with the facts about people and their locations hidden in some random text and a ' + 'question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'Bill went back to the cinema yesterday. Julie went to the school this morning. Fred went to the park yesterday. ' + 'Yesterday Julie went to the office. Where was Julie before the school?\n' + 'Answer: office\n' + '\n' + '\n' + 'This morning Fred went to the kitchen. Fred journeyed to the bedroom yesterday. Mary travelled to the bedroom this morning. ' + 'Yesterday Mary went to the cinema. Where was Mary before the bedroom?\n' + 'Answer: cinema\n' + '\n' + '\n' + 'Yesterday Julie went back to the park. Julie went to the bedroom this morning. Bill journeyed to the cinema yesterday. ' + 'This morning Bill went back to the park. This evening Julie went to the school. This afternoon Julie went back to the park. ' + 'Where was Julie before the bedroom?\n' + 'Answer: park\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - location. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa15': { + 'instruction': + 'I will give you context with the facts about animals, their names and relations. The facts and a question ' + 'are hidden in some random text. You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'Mice are afraid of wolves. 
Gertrude is a mouse. Cats are afraid of sheep. ' + 'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. ' + 'What is gertrude afraid of?\n' + 'Answer: wolf\n' + '\n' + '\n' + 'Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. ' + 'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. ' + 'What is jessica afraid of?\n' + 'Answer: cat\n' + '\n' + '\n' + 'Mice are afraid of cats. Wolves are afraid of sheep. Emily is a wolf. ' + 'Cats are afraid of sheep. Gertrude is a wolf. Sheep are afraid of cats. Winona is a wolf. ' + 'What is emily afraid of?\n' + 'Answer: sheep\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - an animal species. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa16': { + 'instruction': + 'I will give you context with the facts about animals, their names and colors. The facts and a question ' + 'are hidden in some random text. You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'Lily is a frog. Bernhard is a frog. Bernhard is green. Brian is a lion. Brian is white. ' + 'Julius is a swan. Julius is green. Lily is green. Greg is a swan. What color is Greg?\n' + 'Answer: green\n' + '\n' + '\n' + 'Julius is a lion. Lily is a rhino. Bernhard is a swan. Lily is white. Bernhard is green. ' + 'Greg is a rhino. Greg is gray. Julius is white. Brian is a lion. What color is Brian?\n' + 'Answer: white\n' + '\n' + '\n' + 'Brian is a rhino. Julius is a lion. Bernhard is a lion. Greg is a swan. Brian is gray. ' + 'Greg is white. Lily is a rhino. Bernhard is yellow. Lily is gray. What color is Julius?\n' + 'Answer: yellow\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - a color. Do not write anything else after that. ' + 'Do not explain your answer.', + }, + 'qa17': { + 'instruction': + 'I will give you context with the facts about different figures, their location and colors, hidden in ' + 'some random text and a question. ' + 'You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'The triangle is above the pink rectangle. The blue square is to the left of the triangle. ' + 'Is the pink rectangle to the right of the blue square?\n' + 'Answer: yes\n' + '\n' + '\n' + 'The red sphere is to the left of the yellow square. The red sphere is below the pink rectangle. ' + 'Is the pink rectangle to the left of the yellow square?\n' + 'Answer: yes\n' + '' + '\n' + 'The red sphere is above the pink rectangle. The red sphere is to the right of the red square. ' + 'Is the pink rectangle above the red square?\n' + 'Answer: no\n' + '', + 'post_prompt': + 'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. ' + 'Do not explain your answer.', + }, + 'qa18': { + 'instruction': + 'I will give you context with the facts about different objects and their sizes, hidden in ' + 'some random text and a question. ' + 'You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'The box of chocolates fits inside the chest. The box is bigger than the chest. The box is bigger than the suitcase. ' + 'The suitcase fits inside the box. The container is bigger than the box of chocolates. Does the box fit in the box of chocolates?\n' + 'Answer: no\n' + '\n' + '\n' + 'The suitcase is bigger than the container. The container fits inside the box. 
The chest is bigger than the chocolate.' + 'The suitcase fits inside the box. The chest fits inside the box. Does the chocolate fit in the box?\n' + 'Answer: yes\n' + '' + '\n' + 'The chocolate fits inside the box of chocolates. The suitcase fits inside the box. The chocolate fits inside the box. ' + 'The box is bigger than the box of chocolates. The suitcase is bigger than the box of chocolates. Is the chocolate bigger than the box?\n' + 'Answer: no\n' + '', + 'post_prompt': + 'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. ' + 'Do not explain your answer.', + }, + 'qa19': { + 'instruction': + 'I will give you context with the facts about different places and their locations, hidden in ' + 'some random text and a question. ' + 'You need to answer the question based only on the information from the facts.', + 'examples': + '\n' + 'The office is east of the hallway. The kitchen is north of the office. The garden is west of the bedroom. ' + 'The office is west of the garden. The bathroom is north of the garden. How do you go from the kitchen to the garden?\n' + 'Answer: s,e\n' + '\n' + '\n' + 'The bedroom is west of the hallway. The office is east of the garden. The garden is north of the kitchen. ' + 'The kitchen is north of the bathroom. The hallway is west of the garden. How do you go from the kitchen to the hallway?\n' + 'Answer: n,w\n' + '\n' + '\n' + 'The bedroom is south of the hallway. The bathroom is east of the office. The kitchen is west of the garden. ' + 'The garden is south of the office. The office is south of the bedroom. How do you go from the garden to the bedroom?\n' + 'Answer: n,n\n' + '\n', + 'post_prompt': + 'Your answer should contain only two letters, separated by a comma - ordinal directions. You can choose the letters from ' + '$n$, $s$, $e$ and $w$. Do not write anything else after that.', + }, + 'qa20': { + 'instruction': + 'I will give you context with the facts about people, their locations and condition hidden in some random text and a ' + 'question. You need to answer the question based only on the information from the facts. ' + 'If a person was in different locations, use the latest location the person was in to answer the question.', + 'examples': + '\n' + 'Sumit is tired. Where will sumit go?\n' + 'Answer: bedroom\n' + '\n' + '\n' + 'Yann is hungry. Yann journeyed to the kitchen. Why did yann go to the kitchen?\n' + 'Answer: hungry\n' + '\n' + '\n' + 'Antoine is thirsty. Yann is tired. Yann went back to the bedroom. Yann picked up the pajamas there.' + 'Jason is thirsty. Antoine went back to the kitchen. Why did antoine go to the kitchen?\n' + 'Answer: thirsty\n' + '\n' + '\n', + 'post_prompt': + 'Your answer should contain only one word - a person condition or a place. Do not write anything else after that. 
' + 'Do not explain your answer.', + }, +} diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 38808e99..9d3ea7b2 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -327,6 +327,11 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/mmmlu_lite", }, + "opencompass/babilong": { + "ms_id": "", + "hf_id": "", + "local": "./data/babilong/data/", + }, } DATASETS_URL = { @@ -526,4 +531,8 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", "md5": "6dac1d1a3133fe1effff185cbf71d928", }, + "/babilong":{ + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", + "md5": "e400864c31bc58d29eaa3e199751f99b", + } } diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 867f3920..cfbde9c4 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -9,6 +9,7 @@ fuzzywuzzy h5py huggingface_hub<=0.24.7 immutabledict +importlib-metadata jieba json5 mmengine-lite From d415439f9bd4e9c811cda30d463910236bbbb5b7 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 14 Nov 2024 16:45:59 +0800 Subject: [PATCH 06/17] [Fix] Fix bug for first_option_postprocess (#1688) --- README.md | 1 + opencompass/utils/text_postprocessors.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 26a9fd4b..a631b02b 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥 - **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥 - **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) with multiple backend(huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥 - **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 
🔥🔥🔥 diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index e86030db..367fcbd1 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -128,7 +128,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str: text = text.strip() match = re.search(pattern, text, re.DOTALL) if match: - outputs = match.group(0) + outputs = match.group(1) for i in options: if i in outputs: return i From e9e4b69ddb06b51142914078f70cae756e61e02c Mon Sep 17 00:00:00 2001 From: abrohamLee <146956824+abrohamLee@users.noreply.github.com> Date: Thu, 14 Nov 2024 20:42:12 +0800 Subject: [PATCH 07/17] [Feature] MuSR Datset Evaluation (#1689) * MuSR Datset Evaluation * MuSR Datset Evaluation Add an assertion and a Readme.md --- README.md | 1 + configs/eval_musr.py | 44 ++ opencompass/configs/datasets/musr/README.md | 75 ++ opencompass/configs/datasets/musr/musr_gen.py | 135 ++++ .../summarizers/groups/musr_average.py | 19 + opencompass/datasets/__init__.py | 1 + opencompass/datasets/musr/__init__.py | 1 + .../datasets/musr/murder_mystery_solved_ex.py | 81 ++ opencompass/datasets/musr/musr.py | 309 ++++++++ .../musr/object_placements_solved_ex.py | 53 ++ .../musr/team_allocation_solved_ex.py | 72 ++ opencompass/datasets/musr/tree.py | 739 ++++++++++++++++++ opencompass/utils/datasets_info.py | 9 + 13 files changed, 1539 insertions(+) create mode 100644 configs/eval_musr.py create mode 100644 opencompass/configs/datasets/musr/README.md create mode 100644 opencompass/configs/datasets/musr/musr_gen.py create mode 100644 opencompass/configs/summarizers/groups/musr_average.py create mode 100644 opencompass/datasets/musr/__init__.py create mode 100644 opencompass/datasets/musr/murder_mystery_solved_ex.py create mode 100644 opencompass/datasets/musr/musr.py create mode 100644 opencompass/datasets/musr/object_placements_solved_ex.py create mode 100644 opencompass/datasets/musr/team_allocation_solved_ex.py create mode 100644 opencompass/datasets/musr/tree.py diff --git a/README.md b/README.md index a631b02b..8debab3b 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥 - **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥 - **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥 - **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) with multiple backend(huggingface/vllm/lmdeploy). Feel free to give them a try! 
🔥🔥🔥 diff --git a/configs/eval_musr.py b/configs/eval_musr.py new file mode 100644 index 00000000..03c9dabf --- /dev/null +++ b/configs/eval_musr.py @@ -0,0 +1,44 @@ +from mmengine.config import read_base +import os.path as osp + +with read_base(): + from opencompass.configs.datasets.musr.musr_gen import musr_datasets + # from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import ( + models as lmdeploy_internlm2_5_7b_chat_model, + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import ( + models as lmdeploy_qwen2_5_7b_instruct_model, + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import ( + models as lmdeploy_qwen2_5_14b_instruct_model, + ) + from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import ( + models as lmdeploy_yi_1_5_9b_chat_model, + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import ( + models as lmdeploy_qwen2_5_32b_instruct_model, + ) + from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import ( + models as lmdeploy_glm4_9b_chat_model, + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import ( + models as lmdeploy_llama3_1_8b_instruct_model, + ) + from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import ( + models as lmdeploy_ministral_8b_instruct_2410_model, + ) + from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import ( + models as lmdeploy_gemma_9b_it_model, + ) + from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import ( + models as lmdeploy_gemma_27b_it_model, + ) + from opencompass.configs.summarizers.groups.musr_average import summarizer + + +datasets = [*musr_datasets] +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +base_exp_dir = 'outputs/musr/' +work_dir = osp.join(base_exp_dir, 'musr_eval') diff --git a/opencompass/configs/datasets/musr/README.md b/opencompass/configs/datasets/musr/README.md new file mode 100644 index 00000000..d2003a1f --- /dev/null +++ b/opencompass/configs/datasets/musr/README.md @@ -0,0 +1,75 @@ + +# MuSR: Multistep Soft Reasoning Dataset + +MuSR (Multistep Soft Reasoning) is a dataset designed to evaluate language models (LLMs) on complex reasoning tasks embedded in natural language narratives. Created to challenge state-of-the-art models like GPT-4 and others, MuSR emphasizes nuanced reasoning across different domains, including social and physical reasoning, commonsense reasoning, and planning, with tasks framed within realistic scenarios such as murder mysteries, object placements, and team allocations. + +## Overview + +### Purpose + +Current large language models can perform complex tasks through prompting techniques like chain-of-thought reasoning. However, robust multistep reasoning remains challenging. MuSR addresses these limitations by evaluating LLM performance on tasks involving multistep reasoning in three domains: +- **Murder Mysteries**: Requires social and physical deductive reasoning. +- **Object Placements**: Tests observational and theory-of-mind reasoning. +- **Team Allocations**: Focuses on social reasoning and constraint satisfaction. + +### Dataset Construction + +MuSR instances are generated using a neurosymbolic synthetic-to-natural narrative generation algorithm. 
This approach allows for the creation of complex reasoning instances that combine structured reasoning trees with natural language narratives, challenging both direct and nuanced inference capabilities in LLMs. + +MuSR's dataset consists of: +- **Murder Mysteries**: Scenarios with suspects, motives, and opportunities requiring deductive inference. +- **Object Placements**: Scenarios where individuals' observations inform reasoning about object locations. +- **Team Allocations**: Scenarios that simulate social relationships and teamwork for optimal task assignments. + + +### Dataset Access +MuSR dataset is publicly available, with instructions provided on the [GitHub Project](https://github.com/Zayne-Sprague/MuSR). You can download the dataset and use pre-defined prompts or create your own configurations. + +### Evaluation + +1. Install dependencies and configure the environment. +2. Run evaluations using `opencompass configs/eval_musr.py` to assess LLM performance. +3. Analyze results against human performance benchmarks. + +### Example Command +```bash +opencompass configs/eval_musr.py +``` + +## Baselines and Results + +MuSR includes baseline results for multiple LLMs evaluated with chain-of-thought and advanced reasoning strategies. These benchmarks assess model accuracy on reasoning tasks across the three domains. + +| Domain | Baseline Accuracy (GPT-4) | Human Performance | +|------------------|---------------------------|--------------------| +| Murder Mystery | 80.4% | 94.1% | +| Object Placement | 60.9% | 95.0% | +| Team Allocation | 68.4% | 100% | + + +| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | qwen2.5-7b-instruct-turbomind | qwen2.5-14b-instruct-turbomind | yi-1.5-9b-chat-turbomind | qwen2.5-32b-instruct-turbomind | glm-4-9b-chat-turbomind | llama-3_1-8b-instruct-turbomind | ministral-8B-instruct-2410-turbomind | gemma-2-9b-it-turbomind | gemma-2-27b-it-turbomind | +|----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----| +| musr_murder_mysteries | a5ce30 | accuracy | gen | 59.20 | 63.20 | 76.00 | 68.80 | 78.80 | 71.20 | 73.60 | 73.60 | 74.80 | 77.20 | +| musr_object_placements | a5ce30 | accuracy | gen | 54.69 | 56.25 | 57.42 | 52.73 | 66.02 | 49.22 | 57.42 | 60.94 | 60.94 | 62.11 | +| musr_team_allocation | a5ce30 | accuracy | gen | 39.20 | 32.40 | 55.60 | 40.00 | 67.60 | 50.40 | 46.00 | 36.40 | 40.80 | 41.20 | +| musr_average | - | naive_average | gen | 51.03 | 50.62 | 63.01 | 53.84 | 70.81 | 56.94 | 59.01 | 56.98 | 58.85 | 60.17 | + + +## Citation + +If you use MuSR in your research, please cite: +```bibtex +@misc{sprague2024musrtestinglimitschainofthought, + title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning}, + author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett}, + year={2024}, + eprint={2310.16049}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2310.16049}, +} +``` + +## Details + +For further details, please refer to the MuSR paper [here](https://arxiv.org/abs/2310.16049). 
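+
+## Customizing the Evaluation
+
+If you do not want to run the full `configs/eval_musr.py` (which evaluates many models), the snippet below is a minimal sketch of a custom config assembled from the pieces added in this PR. The model import (`lmdeploy_internlm2_5_7b_chat`) is only an illustrative choice; swap in whichever model config you actually want to evaluate.
+
+```python
+from mmengine.config import read_base
+
+with read_base():
+    # MuSR dataset and summarizer definitions shipped with OpenCompass
+    from opencompass.configs.datasets.musr.musr_gen import musr_datasets
+    from opencompass.configs.summarizers.groups.musr_average import summarizer
+    # example model; replace with the model config you want to evaluate
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
+        models as lmdeploy_internlm2_5_7b_chat_model,
+    )
+
+datasets = [*musr_datasets]
+models = [*lmdeploy_internlm2_5_7b_chat_model]
+```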
diff --git a/opencompass/configs/datasets/musr/musr_gen.py b/opencompass/configs/datasets/musr/musr_gen.py new file mode 100644 index 00000000..2d57392b --- /dev/null +++ b/opencompass/configs/datasets/musr/musr_gen.py @@ -0,0 +1,135 @@ +from opencompass.datasets import MusrDataset, MusrEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + + +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, +} + + +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=config['reader_cfg'], + infer_cfg=config['infer_cfg'], + eval_cfg=config['eval_cfg'], + ) + musr_datasets.append(dataset) diff --git a/opencompass/configs/summarizers/groups/musr_average.py b/opencompass/configs/summarizers/groups/musr_average.py new file mode 100644 index 00000000..cd012cf1 --- /dev/null +++ 
b/opencompass/configs/summarizers/groups/musr_average.py @@ -0,0 +1,19 @@ +summarizer = dict( + dataset_abbrs=[ + 'musr_murder_mysteries', + 'musr_object_placements', + 'musr_team_allocation', + 'musr_average' + ], + summary_groups=[ + { + 'name': 'musr_average', + 'subsets': [ + 'musr_murder_mysteries', + 'musr_object_placements', + 'musr_team_allocation', + ], + } + ], +) + \ No newline at end of file diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 4ab4a7d0..e96ffc28 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -87,6 +87,7 @@ from .mmlu_pro import * # noqa: F401, F403 from .MMLUArabic import * # noqa: F401, F403 from .mmmlu import * # noqa: F401, F403 from .multirc import * # noqa: F401, F403 +from .musr import * # noqa: F401, F403 from .narrativeqa import * # noqa: F401, F403 from .natural_question import * # noqa: F401, F403 from .natural_question_cn import * # noqa: F401, F403 diff --git a/opencompass/datasets/musr/__init__.py b/opencompass/datasets/musr/__init__.py new file mode 100644 index 00000000..5cbc34f8 --- /dev/null +++ b/opencompass/datasets/musr/__init__.py @@ -0,0 +1 @@ +from .musr import * # noqa: F401, F403 diff --git a/opencompass/datasets/musr/murder_mystery_solved_ex.py b/opencompass/datasets/musr/murder_mystery_solved_ex.py new file mode 100644 index 00000000..1fe183b9 --- /dev/null +++ b/opencompass/datasets/musr/murder_mystery_solved_ex.py @@ -0,0 +1,81 @@ +# flake8: noqa: E501 +story = """ +In the smoke-filled haze of a thriving jazz club, Alice met her explosive end, leaving Detective Winston to sift through the suspects: Eugene, the shy pianist, and Gabrielle, the sassy club singer. + +While seated at his desk at the precinct, Winston received a phone call from a certain observant local bartender, tipping off the police about a harsh row peaking in a nearby jazz club. He signaled to his partner as they promptly dispatched to the scene, already ringing with sirens and a restless crowd. + +With the police line restraining the horde, the jazz club was undergoing a full round-up as Winston approached the informative bartender. The bartender was engrossed in his account to the officers about a raucous, punch throwing fight Eugene was part of, to his best recollection. Winston remembered Eugene, a jazz fanatic—lurking around the jazz corners more often than anyone else could recount. + +In the heart of the upheaval, lay a woman sprawled on the floor, later identified as Alice, a frequent face at the jazz scene and a financial analyst deeply engrossed in financial transactions. In public, Alice had made her concerns known about her discovery of fraudulent transactions at the bank, promising to report the same to the authorities. Eugene, remembered conspicuously for being a bank teller at the same bank Alice worked at, suddenly seemed closely linked. + +Eugene’s arrest was far from hushed, with the local news broadcasting the progressing drama live, catching sight of Eugene curtailed in handcuffs. Concurrently, it was ascertained—Eugene was a member of the jazz club. This evidence backed by a jazz club membership card retrieved from his wallet during the arrest. + +Just a few steps away, he noticed a man in a suit, the bouncer, a calm figure amid the bedlam. In their conversation, the bouncer corroborated that he had indeed seen Eugene involved in a heated scuffle, landing a few punches. 
The whisperings were starting to gain momentum, since Eugene was believed to be on the losing end of a lawsuit—a battle courtesy of Alice charging Eugene with the financial fraud she had publicly vowed to expose. + +Eugene was known for his frequent presence at the jazz club and on top of that, was an actual member. Therefore, it was hardly a leap to presume Alice meeting her untimely end at the club was no mere happenstance. The jazz club, despite its dim lights and pulsating music, was a public place easily accessible to all, including potential suspects like Eugene and, sadly, the ill-starred Alice. + +Det. Winston knew he was now tasked with a cryptic puzzle. A bank teller, embroiled in suspected fraud and a lawsuit, a jazz club murder scene and a local financial analyst—all woven into a ghastly murder mystery. He sighed in distaste as Eugene was escorted away—a man still oblivious to the chain of events waiting for him. But Winston knew, the night had only just begun for him. + +Winston stared down at the crumpled microphone on the floor. He picked it up gingerly, turning it in his hand. The club was in disarray, debris scattered like confetti. The lab boys were still picking pieces of the grenade apart. + +"Gabrielle's microphone," the coroner confirmed, barely looking up from his task. + +"Give him the once-over for evidence," Winston said, handing the microphone to a nearby officer. + +Leaving the club behind him, Winston sighed heavily. The world of jazz had taken a dark turn that night. Alice, the acclaimed critic with her sarcastic wit and keen critical eye, had been last seen alive here. Her purse lay in the club untouched, a testament to the abruptness of the event. + +Gabrielle had been working as a war correspondent. Winston had read her articles. They were richly detailed, passionate, and highlighted the harsh reality of war zones. Gabrielle hadn't been shy about sharing her experiences or publicly criticizing the military in her pieces. She boldly interviewed military personnel and spent extended periods in conflict zones. + +Alice, though, never missed a chance to pick apart Gabrielle's articles. The vitriolic snippets in Alice’s column were regular features and Gabrielle's staunch defense of her articles, her work in the jazz scene, did little against Alice's respected reputation. + +The tension between them was palpable. Alice had been awarded a major journalist award that Gabrielle had desired. This only deepened their rivalry, with Gabrielle feeling overlooked for this recognition in the Jazz scene. + +Winston cast his gaze over the club once more—a hub of pulsating rhythms now eerily silent. + +A significant part of the evening was Gabrielle's recorded interview with Alice. It played on the local radio, their professional rivalry subtly echoing under their professional demeanor. + +With a deep breath, Winston knew he had a tall task ahead. The jazz club, where Alice was last seen alive was now shrouded in an eerie silence, the vibrant rhythms of what used to be a lively night echoing in the abandoned stage. It was up to him to piece together the missing notes and bring the symphony of this unsolved case to a satisfying finale. + +Who is the most likely murderer? + +Pick one of the following choices: +1 - Eugene +2 - Gabrielle + +You must pick one option. Before selecting a choice, explain your reasoning step by step. 
The murderer needs to have a means (access to weapon), motive (reason to kill the victim), and opportunity (access to crime scene) in order to have killed the victim. Innocent suspects may have two of these proven, but not all three. An innocent suspect may be suspicious for some other reason, but they will not have all of motive, means, and opportunity established. + +If you believe that both suspects have motive, means, and opportunity, you should make an educated guess pick the one for whom these are best established. If you believe that neither suspect has all three established, then choose the suspect where these are most clearly established. Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)" +""".strip() + +reasoning = """ +Let's break this down step-by-step by first deducing which of the two suspects has a means, motive, and opportunity. + +We will start with Eugene. + +Eugene was being sued by Alice for fraudulent transactions. The charge was also very public. Both of these facts point to Eugene having a strong motive. + +Because Eugene has a jazz club membership, and we can deduce that the jazz club membership belongs to the same club Alice was murdered in, we can assume Eugene has an opportunity to commit the crime. + +Although we know Eugene is aggressive because he was throwing punches in the story, we do not know if he has access to the murder weapon. Because he does not have access to a grenade, he does not have a means. + +Let's review Gabrielle next. + +Gabrielle's purse was found at the scene of the crime, and we can then assume she had the opportunity to kill Alice. + +Because Gabrielle has been in conflict zones with military personnel, it's possible that she has access to a grenade. We can say that Gabrielle has a potential means to kill the victim. + +Finally, it appears that Gabrielle and Alice had a rivalry over journalism, which could have boiled up into physical action. Because of this, we can say that Gabrielle has a potential motive to kill the victim. + +Now, reviewing the evidence, we see that: + +Eugene has a motive and opportunity but no means. +Gabrielle has a motive, means, and opportunity. + +Therefore, Gabrielle is the most likely murderer. + +ANSWER: 2 + + +""".strip() + +murder_mystery_solved_ex = f'{story}\n\n{reasoning}' diff --git a/opencompass/datasets/musr/musr.py b/opencompass/datasets/musr/musr.py new file mode 100644 index 00000000..d15e6831 --- /dev/null +++ b/opencompass/datasets/musr/musr.py @@ -0,0 +1,309 @@ +# flake8: noqa: E501 +import json +import os.path as osp + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from .murder_mystery_solved_ex import murder_mystery_solved_ex +from .object_placements_solved_ex import object_placements_solved_ex +from .team_allocation_solved_ex import team_allocation_solved_ex +from .tree import LogicTree + +DATASET_CONFIGS = { + 'murder_mysteries': { + 'file_name': + 'murder_mysteries.json', + 'ex': + murder_mystery_solved_ex, # write user example here + 'system_prompt': + 'You are a helpful assistant that will answer the questions given by the user.', + 'hint': + ('Before selecting a choice, explain your reasoning step by step. 
' + 'The murderer needs to have a means (access to weapon), motive (reason to kill the victim), ' + 'and opportunity (access to crime scene) in order to have killed the victim. ' + 'Innocent suspects may have two of these proven, but not all three. ' + 'An innocent suspect may be suspicious for some other reason, but they will not have all of motive, ' + 'means, and opportunity established.\n\n' + 'If you believe that both suspects have motive, means, and opportunity, you should make an educated guess ' + 'and pick the one for whom these are best established. If you believe that neither suspect has all ' + 'three established, then choose the suspect where these are most clearly established.' + ), + 'hint_before_question': + False, + 'answer_index_modifier': + 1 + }, + 'object_placements': { + 'file_name': + 'object_placements.json', + 'ex': + object_placements_solved_ex, + 'skip_ablated': + True, + 'ablation_depth_modifier': + 2, + 'system_prompt': + 'You are a helpful assistant that will answer the questions given by the user.', + 'hint': + ('Based on this story, we want to identify where someone believes that a certain object is at the end of ' + 'the story. In order to do that, you need to read the story and keep track of where they think the object ' + 'is at each point. When an object is moved, the person may observe its new location if they saw it move.\n\n' + 'To see where an object ends up, they must be able to see the location that it moves to and not be too ' + 'distracted by what they are doing. If they do not observe the object moving, then they will still believe ' + 'it to be in the last location where they observed it.'), + 'hint_before_question': + True, + 'answer_index_modifier': + 1 + }, + 'team_allocation': { + 'file_name': + 'team_allocation.json', + 'ex': + team_allocation_solved_ex, + 'system_prompt': + 'You are a helpful assistant that will answer the questions given by the user.', + 'hint': + ('The story should allow you to determine how good each person is at a skill. Roughly, each person is ' + 'either great, acceptable, or bad at a task. We want to find an optimal assignment of people to tasks ' + 'that uses their skills as well as possible. In addition, one task will have to have two people assigned ' + 'to it. The effectiveness of their teamwork (great team, acceptable team, or bad team) also impacts the ' + 'overall quality of the assignment.\n\n' + 'When two people need to work on a task and one is bad at it, they don\'t necessarily benefit from the ' + 'other person being good, unless they work well together.\n\n' + 'With different strengths, weaknesses, and interpersonal dynamics at play, you should allocate your team ' + 'to find the single assignment to ensure that the tasks overall are completed as effectively as possible.' + ), + 'hint_before_question': + False, + 'answer_index_modifier': + 1 + } +} + + +@LOAD_DATASET.register_module() +class MusrDataset(BaseDataset): + """MuSR. + + Args: + path (str): path to dataset + name (str): name of dataset + self_consistency_n (int) + exclude_contrastive_examples (bool): Whether to exclude contrastive examples + reverse_contrastive_sample (bool): Whether to reverse the selection of contrastive samples + skip_ablated (bool): Whether to skip ablated samples + offset (int): Starting offset for the dataset + sample_size (int): Sample size, None indicates using the entire dataset. 
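+
+    Example (a minimal sketch; assumes the MuSR json files are available under
+    the resolved ``opencompass/musr`` data path):
+
+        >>> dataset = MusrDataset.load(path='opencompass/musr',
+        ...                            name='murder_mysteries')
+        >>> print(dataset[0]['prompt'])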
+ """ + + @staticmethod + def load(path, + name, + self_consistency_n=1, + exclude_contrastive_examples=False, + reverse_contrastive_sample=False, + skip_ablated=False, + randomize=False, + offset=0, + sample_size=None, + **kwargs): + """Load the dataset and flatten fields while constructing prompts, + taking self_consistency_n and ablations into account.""" + + if name not in DATASET_CONFIGS: + raise ValueError( + f'Dataset name {name} not supported. Must be one of {list(DATASET_CONFIGS.keys())}' + ) + + config = DATASET_CONFIGS[name] + path = get_data_path(path) + file_path = osp.join(path, config['file_name']) + + with open(file_path, 'r', encoding='utf-8') as f: + dataset = json.load(f) + + filtered_dataset = [] + hashes_done = [] + + for example in dataset: + if exclude_contrastive_examples and example['questions'][0].get('intermediate_data') and \ + len(example['questions'][0].get('intermediate_data')) > 0 and \ + example['questions'][0]['intermediate_data'][0].get('story_hash_id'): + story_hash = example['questions'][0]['intermediate_data'][0][ + 'story_hash_id'] + if story_hash in hashes_done: + if reverse_contrastive_sample: + filtered_dataset.append(example) + else: + continue + elif not reverse_contrastive_sample: + filtered_dataset.append(example) + hashes_done.append(story_hash) + else: + filtered_dataset.append(example) + + filtered_dataset = filtered_dataset[ + offset:offset + + min(len(filtered_dataset), sample_size) if sample_size else None] + + ablations = [ + # {'prompt': 'regular', 'name': 'regular'}, + # {'prompt': 'cot', 'name': 'cot'}, + { + 'prompt': 'cot+', + 'name': 'cot+' + }, + ] + + # create prompts + flattened_data = [] + for example in filtered_dataset: + context = example['context'] + questions = example['questions'] + + for question in questions: + choices_list = question['choices'] + choices_str = '\n'.join([ + f'{idx + 1} - {choice}' + for idx, choice in enumerate(choices_list) + ]) + gold_answer = question['answer'] + config.get( + 'answer_index_modifier', 1) + + for ablation in ablations: + prompt_style = ablation.get('prompt', 'cot+') + ablation_name = ablation.get('name', 'cot+') + + for scidx in range(self_consistency_n): + ex_str = '' + if ablation.get('use_example') and config.get('ex'): + ex_str = ( + 'Here is an example of solving the task:\n\n' + + config.get('ex') + + '\n\nThis is the end of the example. The real task is below.\n\n---\n\n' + ) + + if prompt_style == 'regular': + prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \ + f'Pick one of the following choices:\n{choices_str}\n\n' \ + 'You must pick one option. Finally, the last thing you generate should be "ANSWER: (your answer here, include the choice number)"' + elif prompt_style == 'cot': + prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \ + f'Pick one of the following choices:\n{choices_str}\n\n' \ + 'You must pick one option. Explain your reasoning step by step before you answer. ' \ + 'Finally, the last thing you generate should be "ANSWER: (your answer here, include the choice number)"' + elif prompt_style == 'cot+': + if config.get('hint_before_question'): + prompt = f'{ex_str}{context}\n\n{config["hint"]}\n\n{question["question"]}\n\n' \ + f'Pick one of the following choices:\n{choices_str}\n\n' \ + 'You must pick one option. Explain your reasoning step by step before you answer. 
' \ + 'Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"' + else: + prompt = f'{ex_str}{context}\n\n{question["question"]}\n\n' \ + f'Pick one of the following choices:\n{choices_str}\n\n' \ + f'You must pick one option. {config["hint"]} Explain your reasoning step by step before you answer. ' \ + 'Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"' + else: + if len(question['intermediate_trees'] + ) == 0 or config.get('skip_ablated', False): + continue + + prompt = f'{ex_str}Answer the following questions given the list of facts per answer choice.\n\n' + for c, t in zip(choices_str.split('\n'), + question['intermediate_trees']): + # extract facts from intermediate_trees + facts = list( + set([ + x.value for x in + LogicTree.from_json(t).get_facts( + include_cs=ablation.get( + 'include_cs', False), + include_deductions_from_level=-1, + no_facts_after_depth=ablation.get( + 'no_facts_after_depth', 3) + + config.get( + 'ablation_depth_modifier', 0)) + ])) + if config.get('allow_sorted_facts', True): + facts = sorted(facts) + facts_str = '\n'.join( + [f'- {fact}' for fact in facts]) + prompt += f'Facts for Choice {c}:\n{facts_str}\n\n' + prompt += f'Given the list of facts per answer choice, answer the following question\n\n' \ + f'{question["question"]}\n\n' \ + f'Pick one of the following choices:\n{choices_str}\n\n' \ + 'You must pick one option. After you have found the answer, say it in this format "ANSWER: (your answer here, include the choice number)"' + + flattened_example = { + 'context': + context, + 'question_text': + question['question'], + 'question': + question, + 'answer': + question['answer'], + 'choices': + choices_list, + 'choices_str': + choices_str, + 'intermediate_trees': + question.get('intermediate_trees', []), + 'intermediate_data': + question.get('intermediate_data', []), + 'prompt': + prompt, + 'system_prompt': + config.get('system_prompt', ''), + 'gold_answer': + gold_answer, + 'scidx': + scidx, # self-consistency index + 'self_consistency_n': + self_consistency_n, + 'ablation_name': + ablation_name, + } + flattened_data.append(flattened_example) + + dataset = Dataset.from_list(flattened_data) + + return dataset + + +@ICL_EVALUATORS.register_module() +class MusrEvaluator(BaseEvaluator): + + def __init__(self, answer_index_modifier=1, self_consistency_n=1): + self.answer_index_modifier = answer_index_modifier + self.self_consistency_n = self_consistency_n + + def score(self, predictions, references): + correct = 0 + assert len(predictions) == len( + references + ), 'Predictions and references must have the same length!' 
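+        # Scoring convention: each reference is the gold 1-based choice index,
+        # and each prediction is expected to contain a line of the form
+        # "ANSWER: <choice number>", from which the first integer is extracted.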
+ + total = len(predictions) + + for pred, ref in zip(predictions, references): + if 'ANSWER:' in pred: + answer_line = [ + line for line in pred.split('\n') if 'ANSWER:' in line + ] + if answer_line: + answer = answer_line[0].split('ANSWER:')[-1].strip() + import re + match = re.search(r'\d+', answer) + if match: + pred_answer = int(match.group()) + if pred_answer == ref: + correct += 1 + accuracy = 100 * correct / total if total > 0 else 0 + return {'accuracy': accuracy} diff --git a/opencompass/datasets/musr/object_placements_solved_ex.py b/opencompass/datasets/musr/object_placements_solved_ex.py new file mode 100644 index 00000000..1f5638fd --- /dev/null +++ b/opencompass/datasets/musr/object_placements_solved_ex.py @@ -0,0 +1,53 @@ +# flake8: noqa: E501 +story = ''' +Petra, the dedicated housewife, felt a thrill at the thought of her surprise anniversary dinner for her husband, Daniel. She had been skillfully maneuvering around Daniel's eagerness to pitch in without disappointing him or giving up her surprise. + +Daniel, ever-the-observant-husband, noted Petra's unusual enthusiasm about the day's menu. Despite not knowing the details, he appreciated her effort and wanted to help—silently, he decided to deploy his best skill—patiently awaiting his moment to help, maybe when Petra asked for something from the pantry. Amidst the excitement, there was Clara, their maid—ever diligent and efficient, trying to keep the surroundings perfect for this special evening. + +Tucked away, under the counter, was Petra's secret recipe book, her culinary treasure. Her solace in confusing times, her secret weapon during their flavorful adventures. While toward the back of the pantry, was the glass jar of Petra's favorite spice blends—something that Daniel was well aware of, in case an opportunity arose for him to assist or distract when Petra might need it. + +All three residents of the home were aware of each item's location. The secret recipe book under the counter, the glass jar in the pantry, and the anxious excitement that filled the air—a fragrance even more intoxicating than the delicious smells that would soon fill the kitchen. + +With tact and secrecy, Petra relocated her cherished recipe book from its hidden spot under the counter to its temporary home on the kitchen table. The pages were swiftly opened to reveal her secret recipes which she was eager to start preparing for the long-awaited anniversary surprise. While Petra was engrossed in her preparations, Clara continued her sweeping routine in the kitchen. Clara's steady broom strokes on the wooden floor echoed a little in the otherwise busy and humming kitchen. In the background, beyond the kitchen door, Daniel could be seen in the dining room, meticulously setting the table for the anticipated special dinner. + +The placement of the rooms allowed Clara to easily notice Petra's movements in her peripheral vision while she was executing her chores. Every move Petra made was observed in Clara's line of sight. Simultaneously, separated by the walls, Daniel was diligently arranging the tableware in the dining room which was separate from Petra's bustling kitchen. + +Hoping to spruce up the setting, Daniel delicately relocated a glass jar filled with decorative pebbles to the center of the dining table. His subtle contribution for the evening - a perfectly presentable table for their special anniversary dinner. 
Amidst the flurry of the special day's preparations, Clara diligently carried on with her duties in the upstairs bathroom, unseen from the dining room. Meanwhile, Petra was wholly engrossed in the allure of a new recipe in her cherished, hidden book which lay opened on the kitchen island, away from prying eyes of the dining room. + +In the middle of her usual tidying, Clara spotted Petra's treasured recipe book on the kitchen table. Ensuring it stayed clandestine, Clara carefully transferred it back to its usual hideaway spot beneath the counter. In the midst of the anniversary excitement, Clara deftly transferred Petra's secret weapon back to its hiding place when Daniel stepped out into the garage to retrieve extra utensils. Performing her duty with a sense of urgency, she made sure to move quietly to not disturb Petra, who was engrossed in the process of boiling a massive pot of pasta water on the stove. + +Despite the commotion and fervor in the kitchen, the hubbub did not stretch as far as the garage, which remained undisturbed by the domestic activity occurring in the main part of the house. Meanwhile, in the kitchen, Petra was oblivious to Clara's subtle maneuver while she busied herself at the stove, focused on making sure the large pot of water reached the perfect boil. + +In the end, the careful orchestration of duties by each individual within the house concluded in a harmonious anniversary celebration. The marks of a successful evening consisted of a delectable meal, a serene atmosphere, and the memory of a smooth, incident-free evening where everyone played their role to perfection. + +Based on this story, we want to identify where someone believes that a certain object is at the end of the story. In order to do that, you need to read the story and keep track of where they think the object is at each point. When an object is moved, the person may observe its new location if they saw it move. + +To see where an object ends up, they must be able to see the location that it moves to and not be too distracted by what they are doing. If they do not observe the object moving, then they will still believe it to be in the last location where they observed it. + +Which location is the most likely place Clara would look to find the glass jar given the story? + +Pick one of the following choices: +1 - dining table +2 - kitchen table +3 - pantry +4 - under counter + +You must pick one option. Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)" +'''.strip() + +reasoning = ''' +Let's solve this by thinking step-by-step. We want to know where Clara will check to find the glass jar, so let's track where Clara sees the glass jar throughout the story. + +At the beginning of the story, it is stated that "All three residents of the home were aware of each item's location... the glass jar in the pantry." From this, we can conclude that the first place in the story where Clara sees the glass jar is in the pantry. + +Throughout the story, the glass jar only moves once to the dining table. However, while Daniel was moving the glass jar, Clara was upstairs in the restroom carrying out her duties. It's highly unlikely that she saw Daniel move the glass jar, so we can assume that she still believes it to be in the pantry. 
+ +Clara does go to the kitchen in the story and moves a recipe book from the kitchen table, but because it's the kitchen table and not the dining room table, we can assume she hasn't seen the glass jar there. + +Now, given the story and evidence, we can assume that Clara believes the glass jar to be in the pantry. + +ANSWER: 3 + +'''.strip() + +object_placements_solved_ex = f'{story}\n\n{reasoning}' diff --git a/opencompass/datasets/musr/team_allocation_solved_ex.py b/opencompass/datasets/musr/team_allocation_solved_ex.py new file mode 100644 index 00000000..3c3fc689 --- /dev/null +++ b/opencompass/datasets/musr/team_allocation_solved_ex.py @@ -0,0 +1,72 @@ +# flake8: noqa: E501 +story = ''' +In the quaint community of Midvale, the local school stood as a beacon of enlightenment, nurturing the minds of the next generation. The teachers, the lifeblood of this institution, were tasked with the noble duty of education, while the unsung heroes—the maintenance crew—ensured the smooth functioning of the school's infrastructure. Amidst this, three town residents, Angela, Greg, and Travis, found themselves at a juncture of life where they were presented with the opportunity to serve in one of these crucial roles. The challenge now lay in the hands of the manager, who had to assign them to either teaching or maintenance, a decision that would set the course for their contributions to the school. + +Angela was a fiercely independent woman, beset with a unique set of strengths and weaknesses. She was a woman of very few words, often finding it hard to articulate her thoughts and explain things clearly. Venturing into education seemed a maze with her apathetic attitude towards learning. She was also seen to be disinterested in reading and the literary field as a whole. This was a juxtaposition to her inability to contribute to maintenance duties because of her fear of tools and machinery, a sinister remnant of a past accident that still haunted her. The basic handyman skills, which most locals learned growing up, were also absent from her repertoire. + +Angela's interactions with Greg and Travis further complicated the equation. On one hand, Greg and Angela had a habit of arguing constantly over trivial matters, which once culminated in their failure to complete a shared basic training exercise adequately. On the other hand, Angela and Travis simply had nothing in common. Their conversations were often fraught with awkward silences, indicative of their lack of shared interests. This lack of coordination was epitomized during a recent team-building exercise when their team finished last. + +Greg was the blue-collar type with a broad frame and muscular build. He had a work ethic that never shied away from toiling through the day to get things done. Growing up, he often helped his father with simple home repairs and minor renovations, learning the ropes of basic handiwork. Additionally, Greg had fortified his skills while refurbishing an old shed with Travis, a testament to their compatible personalities. However, his dislike for education was well known throughout town, further amplified by his lack of patience, especially with children. + +Travis, the third cog in the wheel, was a man of many peculiarities. His stage fright was almost legendary and made it nearly impossible for him to stand in front of a crowd. Often, the mere thought of it could unnerve him. His physical constitution was lightweight and fragile, and long hours of manual labor made him weary. 
He also had a revulsion towards dirt that he complained about at every opportune moment. Like the others, studying did not appeal to him much, so much so that he had stopped reading completely after leaving school prematurely.
+
+The manager understood well that a team’s success depends heavily on the contribution and compatibility of each member. He observed, analyzed, and considered. Now, it was up to him to assign roles to Angela, Greg, and Travis. The school needed educators and maintenance staff, and each had to play their part perfectly.
+
+Given the story, how would you uniquely allocate each person to make sure both tasks are accomplished efficiently?
+
+Pick one of the following choices:
+1 - Teaching: Travis, Maintenance: Angela and Greg
+2 - Teaching: Greg, Maintenance: Angela and Travis
+3 - Teaching: Angela, Maintenance: Greg and Travis
+
+You must pick one option. The story should allow you to determine how good each person is at a skill. Roughly, each person is either great, acceptable, or bad at a task. We want to find an optimal assignment of people to tasks that uses their skills as well as possible. In addition, one task will have to have two people assigned to it. The effectiveness of their teamwork (great team, acceptable team, or bad team) also impacts the overall quality of the assignment.
+
+When two people need to work on a task and one is bad at it, they don’t necessarily benefit from the other person being good, unless they work well together.
+
+With different strengths, weaknesses, and interpersonal dynamics at play, you should allocate your team to find the single assignment to ensure that the tasks overall are completed as effectively as possible.
+
+ Explain your reasoning step by step before you answer. Finally, the last thing you generate should be "ANSWER: (your answer here, including the choice number)"
+'''.strip()
+
+reasoning = '''
+Let's solve this by thinking step-by-step. First, we will figure out each person's skill level for each task. Then, we can measure how well they all work together in pairs. From this, we can find the most efficient assignment that maximizes the scores.
+
+Let's start with Angela. Angela can't articulate her thoughts, and she seems unprepared for teaching. So, let's assume her skill level is 1 for teaching. She also is bad at maintenance due to her fear of maintenance. So, let's assume her skill level is 1 for maintenance as well.
+
+Now, let's look at Greg. Greg has a dislike for education and a lack of patience, so let's assume his skill level for teaching is 1. However, Greg has helped with home repairs and minor renovations, so let's assume his maintenance skill level is 2.
+
+Finally, let's look at Travis. Travis has extreme stage fright, which will make it difficult to teach, so let's assume his teaching skill level is 1. He also has a lightweight and fragile frame as well as hates dirt, so let's assume his maintenance skill level is 1.
+
+Now, let's look at the relationships and how people work together.
+
+Angela and Greg do not get along; they are constantly arguing, so let's assume their ability to work together is 1.
+
+Angela and Travis aren't much better. They both have nothing in common, and they couldn't do a team-building exercise previously, so let's assume their ability to work together is 1.
+
+Finally, Greg and Travis have worked together, and their personalities seem to meld, so let's assume they work well together with a score of 3.
+
+Let's summarize and figure out the best assignment.
+ +Angela is bad at teaching. (1) +Angela is bad at maintenance. (1) +Angela does not work well with Greg. (1) +Angela does not work well with Travis. (1) +Greg is bad at teaching. (1) +Greg is okay with maintenance. (2) +Greg and Travis work well together. (3) +Travis is bad at teaching. (1) +Travis is bad at maintenance. (1) + +Now, let's find the best assignment. + +Option 1: Travis as a teacher (1) + Angela working in maintenance (1) + Greg working in maintenance (2) + Angela and Greg work badly together (1) = 5 +Option 2: Greg as a teacher (1) + Angela working in maintenance (1) + Travis working in maintenance (1) + Angela and Travis work badly together (1) = 4 +Option 3: Angela as a teacher (1) + Greg working in maintenance (2) + Travis working in maintenance (1) + Greg and Travis work well together (3) = 7 + +So, from this, we can see Option 3 has the maximum score. + +ANSWER: 3 + +'''.strip() + +team_allocation_solved_ex = f'{story}\n\n{reasoning}' diff --git a/opencompass/datasets/musr/tree.py b/opencompass/datasets/musr/tree.py new file mode 100644 index 00000000..5d80618f --- /dev/null +++ b/opencompass/datasets/musr/tree.py @@ -0,0 +1,739 @@ +# flake8: noqa: E501 +"""WARNING (or more like an aggressive note). + +A lot of functionality was implemented here for earlier experiments. Most of which is not used. We have left it here +for backwards compatibility with the current dataset as well as because why not. + +ALSO NOTE: + +This file was created to have no dependencies on anything in the repo for a reason. You can copy this file into your +own project and use the classes to parse/visualize/edit the logic trees in the dataset or create your own. + +FINAL NOTE: + +See examples of how to create LogicNodes and LogicTrees in the __main__ part of the file. +""" + +import random +from copy import deepcopy +from enum import Enum +from typing import Any, Dict, List + +import numpy as np + + +class LogicNodeOperatorType: + """How should the deduction combine the nodes (choose will randomly sample + and/or when populate is called)""" + AND = 'and' + OR = 'or' + CHOOSE = 'choose' + + +class LogicNodeFactType: + """Is a node explicit (mentioned in the story) or commonsense knowledge + (left unsaid)""" + EXPLICIT = 'explicit' + COMMONSENSE = 'commonsense' + + +class LogicNodeConstraints: + """Useful for things like children = ['X is the murderer', 'Y is the murderer', 'Z is the murderer'], we no longer use this structure though.""" + ONLY_ONE_CAN_BE_TRUE = 'Only one child can be true' + + +class LogicNodeDeductionType: + """What type of deduction should be used here (not used currently)""" + SYLLOGISM = 'syllogism' + TEMPORAL = 'temporal' + SPATIAL = 'spatial' + CHOOSE = 'choose' + + +class LogicNode: + """A LogicNode is a tree primitive. + + It is either a deduction or a leaf fact. Leaf facts are the ones that we + use in story generation (if they are explicit facts and not commonsense). + """ + value: str + children: List['LogicNode'] + fact_type: str + operator: str + constraints: List[str] + deduction_type: str + prunable: bool + can_be_leaf: bool + + def __init__( + self, + value: str = '', + children: List['LogicNode'] = None, + operator: str = LogicNodeOperatorType.OR, + fact_type: str = LogicNodeFactType.EXPLICIT, + constraints: List[str] = (), + deduction_type: str = None, + prunable: bool = True, + can_be_leaf: bool = False, + frozen: bool = False, + ): + """ + :param value: Content for this specific node (also the deduction of the children). 
+ :param children: The children for this node. + :param operator: Should the children be "And"ed or "Or"ed to create the deduction (the content of this node). + :param fact_type: Explicit or commonsense + :param constraints: Not used anymore (see LogicNodeConstraints) + :param deduction_type: Not used anymore (see LogicNodeDeductionType) + :param prunable: Can this node be removed from the tree (we don't prune in our datasets) + :param can_be_leaf: Can this node be a leaf node (usually false for nodes that you are injecting manually) + :param frozen: Should we add/prune children in the populate function (if frozen, no children will be added or removed, but the children may have children appended/pruned from them). + """ + self.value = value + if children is None: + children = [] + self.children = children + self.operator = operator + self.fact_type = fact_type + self.constraints = constraints + self.deduction_type = deduction_type + self.prunable = prunable + self.can_be_leaf = can_be_leaf + self.frozen = frozen + self.parent = None + + @property + def children(self): + return self._children + + @children.setter + def children(self, children: List['LogicNode']): + self._children = children + for c in self.children: + c.parent = self + + def __str__(self): + line = [] + cnsts = ', '.join([str(x.value) for x in self.constraints]) + + if self.value and self.value != '': + line.append(self.value) + if len(self.children) > 0: + line.append(self.operator) + else: + line.append(self.fact_type) + + if self.deduction_type: + line.append(self.deduction_type) + + if len(self.constraints) > 0: + line.append(cnsts) + + if len(self.children) > 0: + line.append(f'children: {len(self.children)}') + + return ' | '.join(line) + + def __repr__(self): + return str(self) + + def to_json(self): + return { + 'value': self.value, + 'children': [x.to_json() for x in self.children], + 'fact_type': self.fact_type, + 'operator': self.operator, + 'constraints': self.constraints, + 'deduction_type': self.deduction_type, + 'prunable': self.prunable, + 'can_be_leaf': self.can_be_leaf + } + + @classmethod + def from_json(cls, js): + js['children'] = [LogicNode.from_json(x) for x in js['children']] + return cls(**js) + + +class LogicTree: + """Main datastructure used when creating a MuSR example. + + It's basically a standard tree with some parameters controlling the shape. + """ + + nodes: List[LogicNode] + + chance_of_or: float + chance_of_cs_fact: float + depth: int + chance_to_prune: float + chance_to_prune_all: float + bf_factor: Dict[int, float] + deduction_type_sample_rate: Dict[LogicNodeDeductionType, float] + root_structure: List[List[LogicNode]] = () + + def __init__(self, + chance_of_or: float = 0.3, + chance_of_cs_fact: float = 0.1, + depth: int = 2, + chance_to_prune: float = 0.6, + chance_to_prune_all: float = 0.2, + bf_factor: Dict[int, float] = None, + deduction_type_sample_rate: Dict[LogicNodeDeductionType, + float] = None, + enforce_cs_fact_per_level: bool = False, + root_structure: List[Any] = (), + nodes: List[LogicNode] = (), + populate: bool = True, + prune: bool = True): + """ + :param chance_of_or: (not used) how often should a node with children be an OR + :param chance_of_cs_fact: (not used) how often should there be a commonsense node + :param depth: How deep should a tree go + :param chance_to_prune: Percentage chance of pruning a node + :param chance_to_prune_all: Percentage chance of pruning all children from a node. 
+    :param bf_factor: Branching factor (dictionary of percentages {1: 0.33, 2:0.33, 3:0.33} for example).
+    :param deduction_type_sample_rate: (not used, see bf_factor and LogicNodeDeductionType)
+    :param enforce_cs_fact_per_level: Enforce 1 commonsense fact per level in the tree (we use this instead of chance_of_cs_fact)
+    :param root_structure: List of LogicNodes to build off of.
+    :param nodes: List of LogicNodes to define the LogicTree on (we will not populate/prune the tree if this is filled)
+    :param populate: Should we populate children for the tree according to the other parameters?
+    :param prune: Should we prune the children for the tree according to the other parameters?
+    """
+        self.chance_of_or = chance_of_or
+        self.chance_of_cs_fact = chance_of_cs_fact
+        self.depth = depth
+        self.chance_to_prune = chance_to_prune
+        self.chance_to_prune_all = chance_to_prune_all
+        self.bf_factor = bf_factor
+        self.enforce_cs_fact_per_level = enforce_cs_fact_per_level
+
+        if not bf_factor:
+            self.bf_factor = {2: 0.8, 3: 0.2}
+        if not deduction_type_sample_rate:
+            deduction_type_sample_rate = {
+                LogicNodeDeductionType.SYLLOGISM: 1.0
+            }
+
+        self.deduction_type_sample_rate = deduction_type_sample_rate
+        self.root_structure = root_structure
+
+        if len(nodes) > 0:
+            self.nodes = nodes
+        else:
+
+            if root_structure is not None and len(root_structure) > 0:
+                self.nodes = root_structure
+            else:
+                self.nodes = [
+                    LogicNode('root', operator=LogicNodeOperatorType.AND)
+                ]
+
+            if populate:
+                [self.populate(x, 1) for x in self.nodes]
+            if prune:
+                [self.prune(x, 1) for x in self.nodes]
+
+    def __str__(self):
+        return self.print_tree()
+
+    def get_facts(self,
+                  include_cs: bool = False,
+                  include_deductions_from_level: int = -1,
+                  no_facts_after_depth: int = -1):
+        """Get a list of LogicNodes from the tree. By default, you will get the
+        explicit leaf nodes.
+
+        :param include_cs: Include the commonsense nodes from all levels.
+        :param include_deductions_from_level: Include any intermediate deduction nodes from the specified level and deeper.
+        :param no_facts_after_depth: Essentially treat the deductions at the specified depth as leaf nodes.
+        """
+
+        def recurse_facts(_node: LogicNode, depth: int = 0) -> List[str]:
+            node = deepcopy(_node)
+            if depth >= no_facts_after_depth and no_facts_after_depth > -1:
+                node.children = []
+
+            facts = []
+
+            if node.fact_type == LogicNodeFactType.EXPLICIT and len(
+                    node.children) == 0:
+                facts.append(node)
+            if node.fact_type == LogicNodeFactType.COMMONSENSE and include_cs and len(
+                    node.children) == 0:
+                facts.append(node)
+            if len(
+                    node.children
+            ) > 0 and include_deductions_from_level <= depth and include_deductions_from_level > -1:
+                facts.append(node)
+
+            for child in node.children:
+                facts.extend(recurse_facts(child, depth + 1))
+            return list(set(facts))
+
+        facts = []
+        for n in self.nodes:
+            facts.extend(recurse_facts(n))
+        return facts
+
+    def print_tree(self, node=None, level=0):
+        """Deprecated (not used)"""
+        if node is None:
+            node = self.nodes[0]
+        line = '-' * level * 4 + str(node) + (' | ' + str(node.operator) if
+                                              len(node.children) > 0 else '')
+
+        for child in node.children:
+            line += '\n' + self.print_tree(child, level + 1)
+
+        return line
+
+    def print_for_gpt(self,
+                      node=None,
+                      level=0,
+                      pad_char=' ',
+                      pad_space=4,
+                      print_forward=True,
+                      print_conjection_types: bool = False,
+                      print_reasoning_types: bool = False,
+                      ignore_value_after_depth: int = -1,
+                      print_only_nodes_with_value: bool = False):
+        """Complex print function.
We often use it as + print_for_gpt(pad_space=1, pad_char='> ') + + However, more complex arguments can be used to control what is printed. + + This returns a string that must be printed (don't be confused by the method name.) + + :param node: Start at a specific node. + :param level: Controls how much tabbing is done when printing the current node. + :param pad_char: Char to use that specifies depth ('> ' at depth 3 will look like '> > > ' if you have pad_space equal to 1 for example) + :param pad_space: How many spaces to include between pad_chars + :param print_forward: Print the tree with parent nodes first. + :param print_conjection_types: Print the Ands and Ors per deduction (not used) + :param print_reasoning_types: Print the deduction types (not used) + :param ignore_value_after_depth: Ignore content of the nodes once a depth is met + :param print_only_nodes_with_value: Ignore nodes without content. + """ + + line = '' + + if node is None: + node = self.nodes[0] + + if not print_forward: + for child in node.children: + v = self.print_for_gpt( + child, + level + 1, + pad_char=pad_char, + pad_space=pad_space, + print_forward=print_forward, + ignore_value_after_depth=ignore_value_after_depth, + print_only_nodes_with_value=print_only_nodes_with_value) + if v != '': + line += v + '\n' + + ignore_val = ignore_value_after_depth > -1 and ignore_value_after_depth < level + ignore_line = print_only_nodes_with_value and node.value == '' + + if ignore_line: + line_val = '' + else: + line_val = (node.value + ' | ' if node.value != '' and not ignore_val else '') + ( + ('Fact From Story' if node.fact_type == LogicNodeFactType.EXPLICIT else 'Commonsense Knowledge') \ + if len(node.children) == 0 else 'Deduced Fact') + + if level == 0: + line_val = (node.value + ' | ' if node.value != '' else + '') + 'Deduced Root Conclusion' + + if len(node.children) > 0 and (print_conjection_types + or print_reasoning_types): + if print_conjection_types: + line_val += f' ({node.operator}' + else: + line_val += f'(' + if node.deduction_type and print_reasoning_types: + line_val += f' | {node.deduction_type})' + else: + line_val += ')' + + if len(node.constraints) > 0: + cnsts = ', '.join([str(x) for x in node.constraints]) + line_val += f' constraints: [{cnsts}]' + + line += pad_char * level * pad_space + line_val + + if print_forward: + for child in node.children: + v = self.print_for_gpt( + child, + level + 1, + pad_char=pad_char, + pad_space=pad_space, + print_forward=print_forward, + ignore_value_after_depth=ignore_value_after_depth, + print_only_nodes_with_value=print_only_nodes_with_value) + if v != '': + line += '\n' + v + + return line + + def populate(self, node: LogicNode, current_depth: int = 1): + if node.operator == LogicNodeOperatorType.CHOOSE: + node.operator = LogicNodeOperatorType.OR \ + if random.random() < self.chance_of_or else LogicNodeOperatorType.AND + if node.deduction_type == LogicNodeDeductionType.CHOOSE: + if node.operator != LogicNodeOperatorType.AND: + node.deduction_type = None + else: + node.deduction_type = random.choices( + list(self.deduction_type_sample_rate.keys()), + list(self.deduction_type_sample_rate.values()), + k=1)[0] + + if not node.frozen: + + bf = max( + 0, + random.choices(list(self.bf_factor.keys()), + list(self.bf_factor.values()), + k=1)[0] - len(node.children)) + + if bf > 0: + + new_nodes = [] + one_fact_is_cs = False + for idx in range(bf): + roll_for_or = random.random() + fact_type = LogicNodeFactType.COMMONSENSE \ + if random.random() < self.chance_of_cs_fact 
and not one_fact_is_cs else \ + LogicNodeFactType.EXPLICIT + + if roll_for_or > self.chance_of_or and\ + current_depth < self.depth and\ + not fact_type == LogicNodeFactType.COMMONSENSE: + new_nodes.append( + LogicNode( + f'', + operator=LogicNodeOperatorType.AND, + fact_type=fact_type, + deduction_type=random.choices( + list(self.deduction_type_sample_rate.keys( + )), + list(self.deduction_type_sample_rate. + values()), + k=1)[0], + prunable=True, + can_be_leaf=True, + )) + else: + new_nodes.append( + LogicNode(f'', + operator=LogicNodeOperatorType.OR, + fact_type=fact_type, + prunable=True, + can_be_leaf=True)) + + if fact_type == LogicNodeFactType.COMMONSENSE: + node.operator = LogicNodeOperatorType.AND + if not node.deduction_type: + node.deduction_type = random.choices( + list(self.deduction_type_sample_rate.keys()), + list(self.deduction_type_sample_rate.values()), + k=1)[0] + one_fact_is_cs = True + + if not one_fact_is_cs and self.enforce_cs_fact_per_level: + new_nodes.append( + LogicNode(f'', + operator=LogicNodeOperatorType.OR, + fact_type=LogicNodeFactType.COMMONSENSE, + prunable=False, + can_be_leaf=True)) + + node.children.extend(new_nodes) + + if current_depth < self.depth: + for node in node.children: + if node.fact_type == LogicNodeFactType.COMMONSENSE: + continue + self.populate(node, current_depth + 1) + + def prune(self, node: LogicNode, current_depth: int = 1): + to_prune = [] + + if current_depth > 1 and node.can_be_leaf: + if random.random() < self.chance_to_prune_all: + node.children = [] + return + + prunable = [x for x in node.children if x.prunable] + if (len(prunable) > 1 and node.operator == LogicNodeOperatorType.OR or\ + len(prunable) > 2 and node.operator == LogicNodeOperatorType.AND) and\ + current_depth <= self.depth: + + if node.prunable: + for n in random.sample( + prunable, + len(prunable) - + (1 if node.operator == LogicNodeOperatorType.OR else 2)): + roll_to_prune = random.random() + if roll_to_prune < self.chance_to_prune: + to_prune.append(n) + + node.children = [x for x in node.children if x not in to_prune] + for n in node.children: + self.prune(n, current_depth + 1) + + def to_json(self): + args = { + 'chance_of_or': self.chance_of_or, + 'depth': self.depth, + 'chance_to_prune': self.chance_to_prune, + 'chance_to_prune_all': self.chance_to_prune_all, + 'bf_factor': self.bf_factor, + 'deduction_type_sample_rate': self.deduction_type_sample_rate, + 'root_structure': [x.to_json() for x in self.root_structure], + 'nodes': [x.to_json() for x in self.nodes] + } + return args + + @classmethod + def from_json(cls, _js): + js = deepcopy(_js) + js['nodes'] = [LogicNode.from_json(x) for x in js['nodes']] + js['root_structure'] = [ + LogicNode.from_json(x) for x in js['root_structure'] + ] + return cls(**js) + + +if __name__ == '__main__': + """EXAMPLE USES.""" + + def tv_scene_ex(): + root_structure = [ + LogicNode('A good drama tv scene', + operator=LogicNodeOperatorType.OR, + prunable=False, + can_be_leaf=False, + frozen=True) + ] + + root_structure[0].children = [ + LogicNode('Bob is sad.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False), + LogicNode('John now hates Bob.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False), + LogicNode('Bob bought a car.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False), + LogicNode('Bob wanted to be happy.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False), + ] + + tree = LogicTree(depth=4, + 
root_structure=root_structure, + bf_factor={ + 1: 0.5, + 2: 0.5 + }, + chance_of_or=0.0, + chance_of_cs_fact=0.0, + chance_to_prune_all=0.5, + chance_to_prune=0.5, + enforce_cs_fact_per_level=True) + + rep = tree.print_for_gpt(pad_space=1, pad_char='- ') + print(rep) + + def eb_ex(): + root_structure = [ + LogicNode('', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=False) + ] + + n = LogicNode('Eruptions block sunlight.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=False, + frozen=True) + n.children = [ + LogicNode('Eruptions produce ash clouds.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=True, + frozen=True), + LogicNode('Ash blocks sunlight.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=True, + frozen=True), + ] + + g = LogicNode('Eruptions can cause plants to die.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False, + frozen=True) + + g.children = [ + n, + LogicNode('Producers will die without sunlight.', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=True, + frozen=True) + ] + + l = LogicNode('', + operator=LogicNodeOperatorType.AND, + prunable=False, + can_be_leaf=False) + l.children = [g] + + root_structure[0].children = [l] + + tree = LogicTree(depth=5, + root_structure=root_structure, + bf_factor={ + 1: 0.3, + 2: 0.7 + }, + chance_of_or=0.0, + chance_of_cs_fact=0.0, + chance_to_prune_all=0.0, + chance_to_prune=0.0, + enforce_cs_fact_per_level=True) + + rep = tree.print_for_gpt(pad_space=1, pad_char='- ') + print(rep) + + def murder_mystery_ex(): + root_structure = [ + LogicNode('Killer', + operator=LogicNodeOperatorType.OR, + constraints=[LogicNodeConstraints.ONLY_ONE_CAN_BE_TRUE], + prunable=False, + can_be_leaf=False, + frozen=True) + ] + + suspect_nodes = [ + LogicNode(f'Murderer Suspect {idx + 1}', + operator=LogicNodeOperatorType.AND, + prunable=False, + can_be_leaf=False, + frozen=True) for idx in range(1) + ] + for s in suspect_nodes: + s.children = [ + LogicNode('Suspect has means', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False), + LogicNode('Suspect has motive', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False), + LogicNode('Suspect has opportunity', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False) + ] + root_structure[0].children = suspect_nodes + + tree = LogicTree(depth=4, + root_structure=root_structure, + bf_factor={ + 1: 0.5, + 2: 0.5 + }, + chance_of_or=0.0, + chance_of_cs_fact=0.0, + chance_to_prune_all=0.5, + chance_to_prune=0.5, + enforce_cs_fact_per_level=True) + + rep = tree.print_for_gpt(pad_space=1, pad_char='> ') + print(rep) + + def action_ex(): + root_structure = [ + LogicNode('Take an action', + operator=LogicNodeOperatorType.OR, + prunable=False, + can_be_leaf=False, + frozen=True) + ] + + root_structure[0].children = [ + LogicNode('Run away', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=False, + frozen=True), + LogicNode('Fight back', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=False, + frozen=True), + LogicNode('Hide', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=False, + frozen=True), + ] + + for cidx, c in enumerate(root_structure[0].children): + nfacts = random.randint(2, 4) + + for n in range(nfacts): + fact = LogicNode('', + operator=LogicNodeOperatorType.CHOOSE, + prunable=False, + can_be_leaf=False, + 
frozen=True) + fact.children = [ + LogicNode('Pro (supporting the parent action)', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False, + frozen=False), + LogicNode('Con (counters the sibling Pro only)', + operator=LogicNodeOperatorType.CHOOSE, + prunable=True, + can_be_leaf=False, + frozen=False) + ] + root_structure[0].children[cidx].children.append(fact) + + tree = LogicTree(depth=4, + root_structure=root_structure, + bf_factor={ + 1: 0.25, + 2: 0.5, + 3: 0.25 + }, + chance_of_or=0.0, + chance_of_cs_fact=0.0, + chance_to_prune_all=0.5, + chance_to_prune=0.75, + enforce_cs_fact_per_level=True) + + rep = tree.print_for_gpt(pad_space=1, pad_char='- ') + print(rep) + + tv_scene_ex() + eb_ex() + action_ex() diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 9d3ea7b2..53fa1175 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -327,6 +327,11 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/mmmlu_lite", }, + "opencompass/musr": { + "ms_id": "", + "hf_id": "", + "local": "./data/musr", + }, "opencompass/babilong": { "ms_id": "", "hf_id": "", @@ -335,6 +340,10 @@ DATASETS_MAPPING = { } DATASETS_URL = { + "/musr": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip", + "md5": "7447d2a5bec4586035196102135e2af9", + }, "/mmlu/": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip", "md5": "761310671509a239e41c4b717f7fab9c", From 40a9f0be0ddba83f731891ce2cf618101dd399d2 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Fri, 15 Nov 2024 11:06:30 +0800 Subject: [PATCH 08/17] [Update] MUSR dataset config prefix update (#1692) --- configs/eval_musr.py | 2 +- opencompass/configs/datasets/musr/musr_gen.py | 137 +----------------- .../configs/datasets/musr/musr_gen_3c6e15.py | 135 +++++++++++++++++ 3 files changed, 139 insertions(+), 135 deletions(-) create mode 100644 opencompass/configs/datasets/musr/musr_gen_3c6e15.py diff --git a/configs/eval_musr.py b/configs/eval_musr.py index 03c9dabf..0949e82a 100644 --- a/configs/eval_musr.py +++ b/configs/eval_musr.py @@ -2,7 +2,7 @@ from mmengine.config import read_base import os.path as osp with read_base(): - from opencompass.configs.datasets.musr.musr_gen import musr_datasets + from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets # from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import ( models as lmdeploy_internlm2_5_7b_chat_model, diff --git a/opencompass/configs/datasets/musr/musr_gen.py b/opencompass/configs/datasets/musr/musr_gen.py index 2d57392b..eab34936 100644 --- a/opencompass/configs/datasets/musr/musr_gen.py +++ b/opencompass/configs/datasets/musr/musr_gen.py @@ -1,135 +1,4 @@ -from opencompass.datasets import MusrDataset, MusrEvaluator -from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer +from mmengine.config import read_base - -DATASET_CONFIGS = { - 'murder_mysteries': { - 'abbr': 'musr_murder_mysteries', - 'name': 'murder_mysteries', - 'path': 'opencompass/musr', - 'reader_cfg': dict( - input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], - output_column='gold_answer', - ), - 'infer_cfg': dict( - prompt_template=dict( - type=PromptTemplate, - 
template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt='{system_prompt}' - ) - ], - round=[ - dict( - role='HUMAN', - prompt='{prompt}' - ), - ] - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), - ), - 'eval_cfg': dict( - evaluator=dict( - type=MusrEvaluator, - answer_index_modifier=1, - self_consistency_n=1 - ), - ), - }, - 'object_placements': { - 'abbr': 'musr_object_placements', - 'name': 'object_placements', - 'path': 'opencompass/musr', - 'reader_cfg': dict( - input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], - output_column='gold_answer', - ), - 'infer_cfg': dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt='{system_prompt}' - ) - ], - round=[ - dict( - role='HUMAN', - prompt='{prompt}' - ), - ] - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), - ), - 'eval_cfg': dict( - evaluator=dict( - type=MusrEvaluator, - answer_index_modifier=1, - self_consistency_n=1 - ), - ), - }, - 'team_allocation': { - 'abbr': 'musr_team_allocation', - 'name': 'team_allocation', - 'path': 'opencompass/musr', - 'reader_cfg': dict( - input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], - output_column='gold_answer', - ), - 'infer_cfg': dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt='{system_prompt}' - ) - ], - round=[ - dict( - role='HUMAN', - prompt='{prompt}' - ), - ] - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), - ), - 'eval_cfg': dict( - evaluator=dict( - type=MusrEvaluator, - answer_index_modifier=1, - self_consistency_n=1 - ), - ), - }, -} - - -musr_datasets = [] - -for config in DATASET_CONFIGS.values(): - dataset = dict( - abbr=config['abbr'], - type=MusrDataset, - path=config['path'], - name=config['name'], - reader_cfg=config['reader_cfg'], - infer_cfg=config['infer_cfg'], - eval_cfg=config['eval_cfg'], - ) - musr_datasets.append(dataset) +with read_base(): + from .musr_gen_3c6e15 import musr_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/musr/musr_gen_3c6e15.py b/opencompass/configs/datasets/musr/musr_gen_3c6e15.py new file mode 100644 index 00000000..2d57392b --- /dev/null +++ b/opencompass/configs/datasets/musr/musr_gen_3c6e15.py @@ -0,0 +1,135 @@ +from opencompass.datasets import MusrDataset, MusrEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + + +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + 
fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, +} + + +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=config['reader_cfg'], + infer_cfg=config['infer_cfg'], + eval_cfg=config['eval_cfg'], + ) + musr_datasets.append(dataset) From 4a20e1176d1caa3e4476d0c429612c419bb369e7 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:46:29 +0800 Subject: [PATCH 09/17] [CI] Update baselines (#1693) Co-authored-by: zhulin1 --- .github/scripts/oc_score_baseline_fullbench.yaml | 2 +- .github/scripts/oc_score_baseline_testrange.yaml | 16 ++++++++-------- .github/workflows/daily-run-test.yml | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 4eea62fe..c95e7b91 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -1,6 +1,6 @@ internlm2_5-7b-chat-hf_fullbench: race-high: 93.75 - ARC-c: 87.5 + ARC-c: 93.75 BoolQ: 81.25 drop: 81.25 GPQA_diamond: 25 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 6df2b515..f93f8957 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ 
b/.github/scripts/oc_score_baseline_testrange.yaml @@ -44,7 +44,7 @@ gemma-7b-it-hf: gemma-2-9b-it-turbomind: gsm8k: 68.75 - race-high: 81.25 + race-high: 84.38 gemma-7b-it-vllm: gsm8k: 28.12 @@ -112,11 +112,11 @@ mistral-7b-instruct-v0.3-hf: mistral-nemo-instruct-2407-hf: gsm8k: 75 - race-high: 81.25 + race-high: 84.38 mistral-nemo-instruct-2407-turbomind: gsm8k: 75 - race-high: 81.25 + race-high: 84.38 mistral-7b-instruct-v0.1-vllm: gsm8k: 37.5 @@ -132,7 +132,7 @@ MiniCPM3-4B-hf: minicpm-2b-dpo-fp32-hf: gsm8k: 56.25 - race-high: 56.25 + race-high: 53.12 minicpm-2b-sft-bf16-hf: gsm8k: 46.88 @@ -144,7 +144,7 @@ minicpm-2b-sft-fp32-hf: phi-3-mini-4k-instruct-hf: gsm8k: 56.25 - race-high: 78.12 + race-high: 84.38 qwen1.5-0.5b-chat-hf: gsm8k: 0 @@ -192,15 +192,15 @@ internlm2_5-20b-chat-turbomind: mistral-small-instruct-2409-hf: gsm8k: 81.25 - race-high: 90.62 + race-high: 87.50 mistral-small-instruct-2409-turbomind: gsm8k: 78.12 - race-high: 90.62 + race-high: 87.50 qwen2.5-14b-instruct-hf: gsm8k: 71.88 - race-high: 93.75 + race-high: 96.88 qwen2.5-14b-instruct-turbomind: gsm8k: 71.88 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index d16c5b03..125aaa71 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -39,7 +39,7 @@ on: type: string default: "['dsw_cu12']" schedule: - - cron: '56 16 * * *' + - cron: '15 16 * * *' concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 4653f6976ec252aa7cbca11b134921508b09623f Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 18 Nov 2024 12:33:51 +0800 Subject: [PATCH 10/17] [Update] update volc CPU flavor (#1698) --- opencompass/runners/volc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index f076daa6..81d0c869 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -249,7 +249,7 @@ class VOLCRunner(BaseRunner): with open(config_path) as fp: volc_cfg = yaml.safe_load(fp) if num_gpus <= 0: - flavor = 'ml.c1ie.2xlarge' + flavor = 'ml.c3i.2xlarge' elif num_gpus == 1: flavor = 'ml.pni2l.3xlarge' elif num_gpus == 2: From 98242ff1d1387e941154a2b7af8ca88c7ebb8e34 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 18 Nov 2024 20:14:29 +0800 Subject: [PATCH 11/17] [Update] first_option_postprocess (#1699) * update first_option_postprocess * update --- opencompass/utils/text_postprocessors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index 367fcbd1..16083a0b 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -128,7 +128,10 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str: text = text.strip() match = re.search(pattern, text, re.DOTALL) if match: - outputs = match.group(1) + if match.group(1) is not None and match.group(1) != '': + outputs = match.group(1) + else: + outputs = match.group(0) for i in options: if i in outputs: return i From ab8fdbbaab0c785bb1681a3240ba511899e7479c Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 18 Nov 2024 20:24:35 +0800 Subject: [PATCH 12/17] [Update] Update Math auto-download data (#1700) --- opencompass/datasets/math.py | 6 ++++-- opencompass/utils/datasets_info.py | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index a2cf55a1..7a290536 
100644 --- a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -1,4 +1,5 @@ import json +import os import re from os import environ @@ -140,7 +141,7 @@ def extract_answer(response_text: str): class MATHDataset(BaseDataset): @staticmethod - def load(path: str): + def load(path: str, file_name: str = 'math.json'): path = get_data_path(path) dataset = DatasetDict() raw_data = [] @@ -155,7 +156,8 @@ class MATHDataset(BaseDataset): extract_boxed_answer(item['solution']) }) else: - data = json.load(open(path)) + file_path = os.path.join(path, file_name) + data = json.load(open(file_path)) for i in data.keys(): raw_data.append({ 'problem': diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 53fa1175..7d694ff1 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -167,7 +167,7 @@ DATASETS_MAPPING = { "opencompass/math": { "ms_id": "opencompass/math", "hf_id": "opencompass/math", - "local": "./data/math/math.json", + "local": "./data/math/", }, # MMLU "opencompass/mmlu": { @@ -327,11 +327,11 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/mmmlu_lite", }, - "opencompass/musr": { + "opencompass/musr": { "ms_id": "", "hf_id": "", "local": "./data/musr", - }, + }, "opencompass/babilong": { "ms_id": "", "hf_id": "", @@ -374,7 +374,7 @@ DATASETS_URL = { }, "/math/": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip", - "md5": "8b1b897259684672055e6fd4fc07c808", + "md5": "cb5b4c8378085929e20345174e731fdf", }, "/hellaswag/": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip", @@ -540,8 +540,8 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", "md5": "6dac1d1a3133fe1effff185cbf71d928", }, - "/babilong":{ + "/babilong": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", "md5": "e400864c31bc58d29eaa3e199751f99b", - } + }, } From ff831b153e3f81f80ac84a56a254dbb4cbad95c9 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 18 Nov 2024 20:24:50 +0800 Subject: [PATCH 13/17] [BUMP] Bump version to 0.3.6 (#1694) --- opencompass/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/__init__.py b/opencompass/__init__.py index 12c79d2f..0172d916 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.3.5' +__version__ = '0.3.6' def _warn_about_config_migration(): From 05044dfaf2edda87abee168b69fe0215ebf75611 Mon Sep 17 00:00:00 2001 From: Yi Ding Date: Wed, 20 Nov 2024 16:40:22 +0800 Subject: [PATCH 14/17] [Update] Support new error code for Bailing model (#1702) * support new error code * fix the lint problems --- opencompass/models/bailing_api_oc.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 0b721bff..316f529b 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -2,6 +2,7 @@ import concurrent import concurrent.futures import os import socket +import time import traceback from typing import Dict, List, Optional, Union @@ -20,6 +21,8 @@ from .base_api import BaseAPIModel PromptType = Union[PromptList, str] +BAILING_RETRY_DELAY: int = 30 + class HTTPAdapterWithSocketOptions(HTTPAdapter): @@ -200,6 +203,9 @@ class BailingAPI(BaseAPIModel): break # success elif response.status_code == 426: retry_num += 1 # retry + elif response.status_code in [429, 
500, 504]: + time.sleep(BAILING_RETRY_DELAY) + retry_num += 1 # retry else: raise ValueError(f'Status code = {response.status_code}') else: From ed81f9df302b6c9d61d5167e7c98bbedd238a09d Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:37:33 +0800 Subject: [PATCH 15/17] [CI] update torch version and add more datasets into daily testcase (#1701) * update * update * update * update * update * update * update * update * update * update --------- Co-authored-by: zhulin1 --- .../scripts/eval_regression_base_fullbench.py | 4 + .github/scripts/eval_regression_chat.py | 2 + ...val_regression_chat_objective_fullbench.py | 132 ++++++++++++------ .github/scripts/oc_score_assert.py | 17 ++- .../scripts/oc_score_baseline_fullbench.yaml | 50 +++++-- .../scripts/oc_score_baseline_testrange.yaml | 20 +-- .github/workflows/daily-run-test.yml | 12 +- 7 files changed, 160 insertions(+), 77 deletions(-) diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py index d5ad48c4..11c2f514 100644 --- a/.github/scripts/eval_regression_base_fullbench.py +++ b/.github/scripts/eval_regression_base_fullbench.py @@ -7,6 +7,8 @@ with read_base(): bbh_datasets # noqa: F401, E501 from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.dingo.dingo_gen import \ + datasets as dingo_datasets # noqa: F401, E501 from opencompass.configs.datasets.drop.drop_gen_a2697c import \ drop_datasets # noqa: F401, E501 from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \ @@ -120,6 +122,8 @@ summarizer = dict( ['winogrande', 'accuracy'], ['hellaswag', 'accuracy'], ['TheoremQA', 'score'], + ['dingo_en_192', 'score'], + ['dingo_zh_170', 'score'], '###### MathBench-A: Application Part ######', 'college', 'high', diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py index 68c225c5..7762e4f7 100644 --- a/.github/scripts/eval_regression_chat.py +++ b/.github/scripts/eval_regression_chat.py @@ -59,6 +59,8 @@ with read_base(): models as hf_llama3_2_3b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama3_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \ + models as lmdeploy_llama2_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \ diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_objective_fullbench.py index ff8dfba4..c66fba33 100644 --- a/.github/scripts/eval_regression_chat_objective_fullbench.py +++ b/.github/scripts/eval_regression_chat_objective_fullbench.py @@ -3,12 +3,16 @@ from mmengine.config import read_base with read_base(): # read hf models - chat models # Dataset + from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \ + aime2024_datasets # noqa: F401, E501 from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \ ARC_c_datasets # noqa: F401, E501 from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ bbh_datasets # noqa: F401, E501 from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \ cmmlu_datasets 
# noqa: F401, E501 + from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \ + cmo_fib_datasets # noqa: F401, E501 from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \ drop_datasets # noqa: F401, E501 from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \ @@ -28,6 +32,8 @@ with read_base(): humanevalx_datasets # noqa: F401, E501 from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \ + LCB_datasets # noqa: F401, E501 from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ math_datasets # noqa: F401, E501 from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \ @@ -38,6 +44,10 @@ with read_base(): mmlu_datasets # noqa: F401, E501 from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \ + mmmlu_lite_datasets # noqa: F401, E501 + from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \ + musr_datasets # noqa: F401, E501 from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \ nq_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_cot_gen_d95929 import \ @@ -77,10 +87,14 @@ with read_base(): mmlu_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.mmlu_pro import \ mmlu_pro_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.musr_average import \ + summarizer as musr_summarizer # noqa: F401, E501 from opencompass.configs.summarizers.groups.scicode import \ scicode_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.teval import \ teval_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.mmmlu_lite import \ + mmmlu_summary_groups # noqa: F401, E501 # For HumanEval-X Evaluation # Apply the evaluator ip_address and port @@ -122,6 +136,10 @@ mmlu_datasets = [ ] mmlu_pro_datasets = [mmlu_pro_datasets[0]] + +mmmlu_lite_datasets = [ + x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr'] +] mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']] GaokaoBench_datasets = [ x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr'] @@ -137,52 +155,68 @@ datasets += teval_en_datasets datasets += teval_zh_datasets # datasets += SciCode_datasets +musr_summary_groups = musr_summarizer['summary_groups'] +summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []) +summary_groups.append( + { + 'name': 'Mathbench', + 'subsets': ['mathbench-a (average)', 'mathbench-t (average)'], + }, ) + +# Summarizer summarizer = dict( dataset_abbrs=[ + 'Language', ['race-high', 'accuracy'], ['ARC-c', 'accuracy'], ['BoolQ', 'accuracy'], - ['mmlu_pro', 'naive_average'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + ['mmmlu_lite', 'naive_average'], + '', + 'Instruction Following', + ['IFEval', 'Prompt-level-strict-accuracy'], + '', + 'General Reasoning', ['drop', 'accuracy'], ['bbh', 'naive_average'], ['GPQA_diamond', 'accuracy'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + ['musr_average', 'naive_average'], + '', + 'Math Calculation', + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], ['math', 'accuracy'], + ['cmo_fib', 'accuracy'], + ['aime2024', 'accuracy'], + ['Mathbench', 
'naive_average'], + '', + 'Knowledge', ['wikibench-wiki-single_choice_cncircular', 'perf_4'], - ['openai_humaneval', 'humaneval_pass@1'], - ['sanitized_mbpp', 'score'], ['cmmlu', 'naive_average'], ['mmlu', 'naive_average'], + ['mmlu_pro', 'naive_average'], + '', + 'Code', + ['openai_humaneval', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + ['humanevalx', 'naive_average'], + ['ds1000', 'naive_average'], + ['lcb_code_generation', 'pass@1'], + ['lcb_code_execution', 'pass@1'], + ['lcb_test_output', 'pass@1'], + '', + 'Agent', ['teval', 'naive_average'], ['SciCode', 'accuracy'], ['SciCode', 'sub_accuracy'], - ['humanevalx', 'naive_average'], - ['ds1000', 'naive_average'], - ['IFEval', 'Prompt-level-strict-accuracy'], - ['gsm8k', 'accuracy'], - ['GaokaoBench', 'weighted_average'], - ['triviaqa_wiki_1shot', 'score'], - ['nq_open_1shot', 'score'], - ['hellaswag', 'accuracy'], - ['TheoremQA', 'score'], - '###### MathBench-A: Application Part ######', - 'college', - 'high', - 'middle', - 'primary', - 'arithmetic', - 'mathbench-a (average)', - '###### MathBench-T: Theory Part ######', - 'college_knowledge', - 'high_knowledge', - 'middle_knowledge', - 'primary_knowledge', - 'mathbench-t (average)', - '###### Overall: Average between MathBench-A and MathBench-T ######', - 'Overall', '', 'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two', - '' + '', 'mmlu', 'mmlu-stem', 'mmlu-social-science', @@ -212,15 +246,6 @@ summarizer = dict( 'mmlu_pro_psychology', 'mmlu_pro_other', '', - 'GaokaoBench_2010-2022_Math_II_MCQs', - 'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank', - '', - 'humanevalx-python', - 'humanevalx-cpp', - 'humanevalx-go', - 'humanevalx-java', - 'humanevalx-js', - '', 'ds1000_Pandas', 'ds1000_Numpy', 'ds1000_Tensorflow', @@ -228,9 +253,38 @@ summarizer = dict( 'ds1000_Sklearn', 'ds1000_Pytorch', 'ds1000_Matplotlib', + '', + 'mmmlu_lite', + 'openai_mmmlu_lite_AR-XY', + 'openai_mmmlu_lite_BN-BD', + 'openai_mmmlu_lite_DE-DE', + 'openai_mmmlu_lite_ES-LA', + 'openai_mmmlu_lite_FR-FR', + 'openai_mmmlu_lite_HI-IN', + 'openai_mmmlu_lite_ID-ID', + 'openai_mmmlu_lite_IT-IT', + 'openai_mmmlu_lite_JA-JP', + 'openai_mmmlu_lite_KO-KR', + 'openai_mmmlu_lite_PT-BR', + 'openai_mmmlu_lite_SW-KE', + 'openai_mmmlu_lite_YO-NG', + 'openai_mmmlu_lite_ZH-CN', + '', + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', ], - summary_groups=sum( - [v for k, v in locals().items() if k.endswith('_summary_groups')], []), + summary_groups=summary_groups, ) for d in datasets: diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index d8e33adb..179dec27 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -131,14 +131,16 @@ class TestChatObjFullbench: 'internlm2_5-7b-chat-hf_fullbench', 'internlm2_5-7b-chat-turbomind_fullbench' ] for p2 in [ - 'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math', + 'race-high', 'ARC-c', 'BoolQ', 'triviaqa_wiki_1shot', 'nq_open_1shot', + 'IFEval', 'drop', 'GPQA_diamond', 'hellaswag', 'TheoremQA', + 'musr_average', 'gsm8k', 'math', 'cmo_fib', 'aime2024', 'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'ds1000', - 'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'hellaswag', - 'TheoremQA', 'college', 'college_knowledge', + 
'lcb_code_generation', 'lcb_code_execution', 'lcb_test_output', 'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math', 'ds1000_Pandas', 'ds1000_Numpy', 'ds1000_Tensorflow', 'ds1000_Scipy', 'ds1000_Sklearn', - 'ds1000_Pytorch', 'ds1000_Matplotlib' + 'ds1000_Pytorch', 'ds1000_Matplotlib', 'openai_mmmlu_lite_AR-XY', + 'college', 'college_knowledge' ]]) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): @@ -188,9 +190,10 @@ class TestBaseFullbench: 'race-high', 'ARC-c', 'BoolQ', 'drop', 'GPQA_diamond', 'math', 'wikibench-wiki-single_choice_cncircular', 'sanitized_mbpp', 'gsm8k', 'triviaqa_wiki_1shot', 'nq_open_1shot', 'winogrande', 'hellaswag', - 'TheoremQA', 'college', 'college_knowledge', - 'bbh-logical_deduction_seven_objects', 'bbh-multistep_arithmetic_two', - 'mmlu-other', 'cmmlu-china-specific', 'mmlu_pro_math' + 'TheoremQA', 'dingo_en_192', 'dingo_zh_170', 'college', + 'college_knowledge', 'bbh-logical_deduction_seven_objects', + 'bbh-multistep_arithmetic_two', 'mmlu-other', 'cmmlu-china-specific', + 'mmlu_pro_math' ]]) def test_model_dataset_score(self, baseline_scores_fullbench, result_scores, model, dataset): diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index c95e7b91..413a99a3 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -2,19 +2,24 @@ internlm2_5-7b-chat-hf_fullbench: race-high: 93.75 ARC-c: 93.75 BoolQ: 81.25 + triviaqa_wiki_1shot: 50 + nq_open_1shot: 25 + IFEval: 50 drop: 81.25 GPQA_diamond: 25 + hellaswag: 87.5 + TheoremQA: 18.75 + musr_average: 39.58 + gsm8k: 56.25 math: 75 + cmo_fib: 6.25 + aime2024: 6.25 wikibench-wiki-single_choice_cncircular: 50 sanitized_mbpp: 68.75 ds1000: 16.96 - gsm8k: 56.25 - triviaqa_wiki_1shot: 50 - nq_open_1shot: 25 - hellaswag: 87.5 - TheoremQA: 18.75 - college: 12.5 - college_knowledge: 87.5 + lcb_code_generation: 12.5 + lcb_code_execution: 43.75 + lcb_test_output: 18.75 bbh-logical_deduction_seven_objects: 50 bbh-multistep_arithmetic_two: 68.75 mmlu-other: 72.6 @@ -27,6 +32,9 @@ internlm2_5-7b-chat-hf_fullbench: ds1000_Sklearn: 18.75 ds1000_Pytorch: 12.5 ds1000_Matplotlib: 43.75 + openai_mmmlu_lite_AR-XY: 37.5 + college: 12.5 + college_knowledge: 87.5 Alignbench总分: 0.65 Alignbench专业能力: 7.83 AlpacaEvaltotal: 0 @@ -56,19 +64,24 @@ internlm2_5-7b-chat-turbomind_fullbench: race-high: 93.75 ARC-c: 87.5 BoolQ: 68.75 + triviaqa_wiki_1shot: 50 + nq_open_1shot: 25 + IFEval: 50 drop: 75 - GPQA_diamond: 25 + hellaswag: 81.25 + TheoremQA: 6.25 + musr_average: 39.58 + gsm8k: 68.75 math: 75 + GPQA_diamond: 25 + cmo_fib: 6.25 + aime2024: 6.25 wikibench-wiki-single_choice_cncircular: 25 sanitized_mbpp: 68.75 ds1000: 13.39 - gsm8k: 68.75 - triviaqa_wiki_1shot: 50 - nq_open_1shot: 25 - hellaswag: 81.25 - TheoremQA: 6.25 - college: 0 - college_knowledge: 87.5 + lcb_code_generation: 12.5 + lcb_code_execution: 43.75 + lcb_test_output: 12.5 bbh-logical_deduction_seven_objects: 56.25 bbh-multistep_arithmetic_two: 68.75 mmlu-other: 74.04 @@ -81,6 +94,9 @@ internlm2_5-7b-chat-turbomind_fullbench: ds1000_Sklearn: 18.75 ds1000_Pytorch: 6.25 ds1000_Matplotlib: 37.5 + openai_mmmlu_lite_AR-XY: 37.5 + college: 0 + college_knowledge: 87.5 Alignbench总分: 0.64 Alignbench专业能力: 7.6 AlpacaEvaltotal: 10 @@ -121,6 +137,8 @@ internlm2_5-7b-hf_fullbench: winogrande: 75 hellaswag: 93.75 TheoremQA: 25 + dingo_en_192: 37.5 + dingo_zh_170: 
100 college: 12.5 college_knowledge: 87.5 bbh-logical_deduction_seven_objects: 43.75 @@ -144,6 +162,8 @@ internlm2_5-7b-turbomind_fullbench: winogrande: 87.5 hellaswag: 93.75 TheoremQA: 31.25 + dingo_en_192: 43.75 + dingo_zh_170: 100 college: 12.5 college_knowledge: 87.5 bbh-logical_deduction_seven_objects: 50 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index f93f8957..68f6660a 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -43,11 +43,11 @@ gemma-7b-it-hf: race-high: 68.75 gemma-2-9b-it-turbomind: - gsm8k: 68.75 + gsm8k: 65.62 race-high: 84.38 gemma-7b-it-vllm: - gsm8k: 28.12 + gsm8k: 34.38 race-high: 68.75 internlm2_5-7b-chat-hf: @@ -95,7 +95,7 @@ llama-3_1-8b-instruct-turbomind: race-high: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k: 65.62 + gsm8k: 62.50 race-high: 81.25 llama-3-8b-instruct-turbomind: @@ -112,15 +112,15 @@ mistral-7b-instruct-v0.3-hf: mistral-nemo-instruct-2407-hf: gsm8k: 75 - race-high: 84.38 + race-high: 81.25 mistral-nemo-instruct-2407-turbomind: - gsm8k: 75 - race-high: 84.38 + gsm8k: 68.75 + race-high: 87.50 mistral-7b-instruct-v0.1-vllm: - gsm8k: 37.5 - race-high: 71.88 + gsm8k: 34.38 + race-high: 68.75 mistral-7b-instruct-v0.2-vllm: gsm8k: 43.75 @@ -255,13 +255,13 @@ gemma-7b-hf: winogrande: 78.12 gemma-2b-vllm: - gsm8k: 18.75 + gsm8k: 15.62 GPQA_diamond: 6.25 race-high: winogrande: gemma-7b-vllm: - gsm8k: 59.38 + gsm8k: 53.12 GPQA_diamond: 6.25 race-high: winogrande: diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 125aaa71..1d7a1189 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -163,9 +163,9 @@ jobs: pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip uninstall torch torchvision torchaudio -y - pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs pip list - name: Prepare - create conda env and install torch - cu12 @@ -183,9 +183,9 @@ jobs: pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install 
/cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs pip list - name: Prepare - reinstall lmdeploy - cu12 From 500fb1032a71b42b9843b705b522ec91274be9cb Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 21 Nov 2024 16:51:18 +0800 Subject: [PATCH 16/17] [Update] Update configurations (#1704) --- .../livecodebench/livecodebench_gen.py | 2 +- .../livecodebench/livecodebench_gen_6966bc.py | 164 ++++++++++++++++++ .../livecodebench/livecodebench_gen.py | 2 +- .../livecodebench/livecodebench_gen_6966bc.py | 164 ++++++++++++++++++ .../models/chatglm/lmdeploy_glm4_9b.py | 15 ++ .../models/qwen2_5/lmdeploy_qwen2_5_14b.py | 15 ++ .../models/qwen2_5/lmdeploy_qwen2_5_32b.py | 15 ++ .../models/qwen2_5/lmdeploy_qwen2_5_72b.py | 17 ++ .../configs/models/yi/lmdeploy_yi_1_5_9b.py | 15 ++ opencompass/models/openai_api.py | 4 +- .../models/turbomind_with_tf_above_v4_33.py | 5 + 11 files changed, 414 insertions(+), 4 deletions(-) create mode 100644 configs/datasets/livecodebench/livecodebench_gen_6966bc.py create mode 100644 opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py create mode 100644 opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py create mode 100644 opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py create mode 100644 opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py create mode 100644 opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py create mode 100644 opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py diff --git a/configs/datasets/livecodebench/livecodebench_gen.py b/configs/datasets/livecodebench/livecodebench_gen.py index a82ef82e..f663df06 100644 --- a/configs/datasets/livecodebench/livecodebench_gen.py +++ b/configs/datasets/livecodebench/livecodebench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .livecodebench_gen_b2b0fd import LCB_datasets # noqa: F401, F403 + from .livecodebench_gen_6966bc import LCB_datasets # noqa: F401, F403 diff --git a/configs/datasets/livecodebench/livecodebench_gen_6966bc.py b/configs/datasets/livecodebench/livecodebench_gen_6966bc.py new file mode 100644 index 00000000..6f2da11e --- /dev/null +++ b/configs/datasets/livecodebench/livecodebench_gen_6966bc.py @@ -0,0 +1,164 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an 
expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
+ +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py index a82ef82e..f663df06 100644 --- a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py +++ b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .livecodebench_gen_b2b0fd import LCB_datasets # noqa: F401, F403 + from .livecodebench_gen_6966bc import LCB_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py b/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py new file mode 100644 index 00000000..6f2da11e --- /dev/null +++ b/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py @@ -0,0 +1,164 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
+ +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
+ +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py b/opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py new file mode 100644 index 00000000..e9e2d394 --- /dev/null +++ b/opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='glm-4-9b-turbomind', + path='THUDM/glm-4-9b', + engine_config=dict(max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=8192, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py new file mode 100644 index 00000000..a5f63e54 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-14b-turbomind', + path='Qwen/Qwen2.5-14B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py new file mode 100644 index 00000000..bf0c0c15 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-32b-turbomind', + path='Qwen/Qwen2.5-32B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py new file mode 100644 index 00000000..0bee0557 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py @@ -0,0 +1,17 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-72b-turbomind', + path='Qwen/Qwen2.5-72B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=4), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024 + ), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py 
b/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py new file mode 100644 index 00000000..5780ec2e --- /dev/null +++ b/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='yi-1.5-9b-turbomind', + path='01-ai/Yi-1.5-9B', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index ab7ab304..4e5d2246 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -526,7 +526,7 @@ class OpenAISDK(OpenAI): def _generate(self, input: PromptList | str, max_out_len: int, temperature: float) -> str: - from openai import BadRequestError + from openai import APIStatusError, BadRequestError assert isinstance(input, (str, PromptList)) # max num token for gpt-3.5-turbo is 4097 @@ -616,7 +616,7 @@ class OpenAISDK(OpenAI): from the API provider.') return responses.choices[0].message.content - except BadRequestError as e: + except (BadRequestError, APIStatusError) as e: # Handle BadRequest status # You can specify self.status_code_mappings to bypass \ # API sensitivity blocks diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index cf5b880b..79e6e556 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -87,6 +87,7 @@ class TurboMindModelwithChatTemplate(BaseModel): def generate(self, inputs: List[str], max_out_len: int, + min_out_len: Optional[int] = None, stopping_criteria: List[str] = [], do_sample: Optional[bool] = None, temperature: float = 1.0, @@ -123,6 +124,10 @@ class TurboMindModelwithChatTemplate(BaseModel): gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) gen_config.update(self.gen_config) + if max_out_len is not None: + gen_config['max_new_tokens'] = max_out_len + if min_out_len is not None: + gen_config['min_new_tokens'] = min_out_len if do_sample or ('do_sample' in self.gen_config and self.gen_config['do_sample']): gen_config['top_k'] = 40 gen_config['temperature'] = temperature From 80e3b9ef37f5fa827c7c34362e22926276d7f574 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 21 Nov 2024 21:29:43 +0800 Subject: [PATCH 17/17] [Update] Add math prm 800k (#1708) --- .../datasets/math/math_prm800k_500_gen.py | 36 +++++++++++++++++++ opencompass/models/openai_api.py | 8 ++--- 2 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_gen.py diff --git a/opencompass/configs/datasets/math/math_prm800k_500_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_gen.py new file mode 100644 index 00000000..1b3bba23 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_gen.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', 
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 4e5d2246..91de1192 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -625,12 +625,10 @@ class OpenAISDK(OpenAI): status_code = e.status_code if (status_code is not None and status_code in self.status_code_mappings): - original_error_message = e.body.get('message') error_message = self.status_code_mappings[status_code] - self.logger.info( - f'Status Code: {status_code}, ' - f'Original Error Message: {original_error_message},' - f'Return Message: {error_message} ') + self.logger.info(f'Status Code: {status_code},\n' + f'Original Error Message: {e},\n' + f'Return Message: {error_message} ') return error_message else: self.logger.error(e)