diff --git a/configs/eval_corebench_2409_longcontext.py b/configs/eval_corebench_2409_longcontext.py
new file mode 100644
index 00000000..718044d2
--- /dev/null
+++ b/configs/eval_corebench_2409_longcontext.py
@@ -0,0 +1,138 @@
+import os.path as osp
+from copy import deepcopy
+
+from mmengine.config import read_base
+from opencompass.models import (HuggingFacewithChatTemplate,
+                                TurboMindModelwithChatTemplate)
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import DLCRunner, LocalRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
+
+
+#######################################################################
+#                      PART 0  Essential Configs                      #
+#######################################################################
+with read_base():
+    from opencompass.configs.datasets.longbench.longbench import \
+        longbench_datasets
+    from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
+        needlebench_datasets as needlebench_8k_datasets
+    from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
+        needlebench_datasets as needlebench_32k_datasets
+    from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
+        needlebench_datasets as needlebench_128k_datasets
+    from opencompass.configs.datasets.ruler.ruler_8k_gen import \
+        ruler_datasets as ruler_8k_datasets
+    from opencompass.configs.datasets.ruler.ruler_32k_gen import \
+        ruler_datasets as ruler_32k_datasets
+    from opencompass.configs.datasets.ruler.ruler_128k_gen import \
+        ruler_datasets as ruler_128k_datasets
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.longbench import \
+        longbench_summary_groups
+    from opencompass.configs.summarizers.groups.ruler import \
+        ruler_summary_groups
+    from opencompass.configs.summarizers.needlebench import (
+        needlebench_8k_summarizer, needlebench_32k_summarizer,
+        needlebench_128k_summarizer)
+
+    # Instruct models
+    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
+        models as lmdeploy_qwen2_7b_instruct_model
+
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
+        models as lmdeploy_internlm2_5_7b_1m_chat_model
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
+        models as llama3_1_8b_instruct_model
+
+
+#######################################################################
+#                       PART 1  Datasets List                         #
+#######################################################################
+# datasets list for evaluation
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+
+#######################################################################
+#                     PART 2  Dataset Summarizer                      #
+#######################################################################
+needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
+needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
+needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
+
+# Instruct models summarizer
+summarizer = dict(
+    dataset_abbrs=[
+        ['ruler_8k', 'naive_average'],
+        ['ruler_32k', 'naive_average'],
+        ['ruler_128k', 'naive_average'],
+        ['NeedleBench-Overall-Score-8K', 'weighted_average'],
+        ['NeedleBench-Overall-Score-32K', 'weighted_average'],
+        ['NeedleBench-Overall-Score-128K', 'weighted_average'],
+        ['longbench', 'naive_average'],
+        ['longbench_zh', 'naive_average'],
+        ['longbench_en', 'naive_average'],
+        '',
+        'longbench_single-document-qa',
+        'longbench_multi-document-qa',
+        'longbench_summarization',
+        'longbench_few-shot-learning',
+        'longbench_synthetic-tasks',
+        'longbench_code-completion',
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
+
+
+#######################################################################
+#                        PART 3  Models List                          #
+#######################################################################
+
+lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
+lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
+lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
+lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
+lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4
+
+llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
+llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
+llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
+llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
+llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+#######################################################################
+#              PART 4  Inference/Evaluation Configuration             #
+#######################################################################
+
+# Local Runner
+infer = dict(
+    partitioner=dict(
+        type=NumWorkerPartitioner,
+        num_worker=8
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        retry=0,  # Modify if needed
+        task=dict(type=OpenICLInferTask)
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        task=dict(type=OpenICLEvalTask)),
+)
+
+
+#######################################################################
+#                     PART 5  Utils Configuration                     #
+#######################################################################
+base_exp_dir = 'outputs/corebench/'
+work_dir = osp.join(base_exp_dir, 'long_context')
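A note on the collection idiom that recurs in PART 1 and PART 3 of all three configs: every module-level name ending in `_datasets` (or `_model`, `_summary_groups`) is expected to hold a list, and `sum(..., [])` concatenates those lists into one. The sketch below is a minimal, self-contained illustration; the two toy variables are placeholders, not real OpenCompass dataset configs.

```python
# Minimal sketch of the `locals()` suffix-scan used in the configs above.
# Toy stand-ins for the lists normally pulled in via read_base():
ruler_8k_datasets = [{'abbr': 'ruler_8k'}]
longbench_datasets = [{'abbr': 'longbench'}]

# Any module-level name ending in `_datasets` is collected automatically.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

assert [d['abbr'] for d in datasets] == ['ruler_8k', 'longbench']
```

This is also why the needlebench summarizers are unpacked into `*_summary_groups` variables before `summarizer` is built: the same suffix scan then collects them alongside the ruler and longbench groups.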
diff --git a/configs/eval_corebench_2409_objective.py b/configs/eval_corebench_2409_objective.py
new file mode 100644
index 00000000..e14c5247
--- /dev/null
+++ b/configs/eval_corebench_2409_objective.py
@@ -0,0 +1,208 @@
+from mmengine.config import read_base
+import os.path as osp
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+
+#######################################################################
+#                      PART 0  Essential Configs                      #
+#######################################################################
+with read_base():
+    # Datasets Part
+    ## Core Set
+    # ## Examination
+    from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
+    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
+    from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets
+
+    # ## Reasoning
+    from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
+    # TODO: Add HellaSwag
+    # TODO: Add DROP
+
+    # ## Math
+    from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
+    # TODO: Add GSM8K
+    # TODO: Add MathBench
+
+    # ## Scientific
+    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
+
+    # ## Coding
+    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
+    # TODO: Add MBPP
+    # TODO: Add LiveCodeBench
+
+    # ## Instruction Following
+    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
+
+    # Summarizer
+    from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
+    from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
+    from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
+    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
+
+
+    # Model List
+    # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
+    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
+    # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
+    # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
+    # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
+    # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
+
+#######################################################################
+#                       PART 1  Datasets List                         #
+#######################################################################
+# datasets list for evaluation
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+
+#######################################################################
+#                     PART 2  Dataset Summarizer                      #
+#######################################################################
+# with read_base():
+
+core_summary_groups = [
+    {
+        'name': 'core_average',
+        'subsets': [
+            ['mmlu', 'accuracy'],
+            ['mmlu_pro', 'accuracy'],
+            # ['cmmlu', 'naive_average'],
+            ['cmmlu', 'accuracy'],
+            ['bbh', 'score'],
+            ['math', 'accuracy'],
+            ['openai_humaneval', 'humaneval_pass@1'],
+            ['GPQA_diamond', 'accuracy'],
+            ['IFEval', 'Prompt-level-strict-accuracy'],
+        ],
+    },
+]
+
+summarizer = dict(
+    dataset_abbrs=[
+        ['core_average', 'naive_average'],
+        ['mmlu', 'accuracy'],
+        ['mmlu_pro', 'accuracy'],
+        ['cmmlu', 'accuracy'],
+        ['bbh', 'score'],
+        ['math', 'accuracy'],
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['GPQA_diamond', 'accuracy'],
+        ['IFEval', 'Prompt-level-strict-accuracy'],
+        '',
+
+        ['mmlu', 'accuracy'],
+        ['mmlu-stem', 'accuracy'],
+        ['mmlu-social-science', 'accuracy'],
+        ['mmlu-humanities', 'accuracy'],
+        ['mmlu-other', 'accuracy'],
+
+        '',
+        ['mmlu_pro', 'accuracy'],
+        ['mmlu_pro_math', 'accuracy'],
+        ['mmlu_pro_physics', 'accuracy'],
+        ['mmlu_pro_chemistry', 'accuracy'],
+        ['mmlu_pro_law', 'accuracy'],
+        ['mmlu_pro_engineering', 'accuracy'],
+        ['mmlu_pro_other', 'accuracy'],
+        ['mmlu_pro_economics', 'accuracy'],
+        ['mmlu_pro_health', 'accuracy'],
+        ['mmlu_pro_psychology', 'accuracy'],
+        ['mmlu_pro_business', 'accuracy'],
+        ['mmlu_pro_biology', 'accuracy'],
+        ['mmlu_pro_philosophy', 'accuracy'],
+        ['mmlu_pro_computer_science', 'accuracy'],
+        ['mmlu_pro_history', 'accuracy'],
+        '',
+        ['cmmlu', 'accuracy'],
+        ['cmmlu-stem', 'accuracy'],
+        ['cmmlu-social-science', 'accuracy'],
+        ['cmmlu-humanities', 'accuracy'],
+        ['cmmlu-other', 'accuracy'],
+        ['cmmlu-china-specific', 'accuracy'],
+        '',
+        ['bbh', 'extract_rate'],
+        ['math', 'extract_rate'],
+        # ['openai_humaneval', 'extract_rate'],
+        ['GPQA_diamond', 'extract_rate'],
+        # ['IFEval', 'extract_rate'],
+        '',
+        ['mmlu', 'extract_rate'],
+        ['mmlu-stem', 'extract_rate'],
+        ['mmlu-social-science', 'extract_rate'],
+        ['mmlu-humanities', 'extract_rate'],
+        ['mmlu-other', 'extract_rate'],
+        '',
+        ['mmlu_pro', 'extract_rate'],
+        ['mmlu_pro_math', 'extract_rate'],
+        ['mmlu_pro_physics', 'extract_rate'],
+        ['mmlu_pro_chemistry', 'extract_rate'],
+        ['mmlu_pro_law', 'extract_rate'],
+        ['mmlu_pro_engineering', 'extract_rate'],
+        ['mmlu_pro_other', 'extract_rate'],
+        ['mmlu_pro_economics', 'extract_rate'],
+        ['mmlu_pro_health', 'extract_rate'],
+        ['mmlu_pro_psychology', 'extract_rate'],
+        ['mmlu_pro_business', 'extract_rate'],
+        ['mmlu_pro_biology', 'extract_rate'],
+        ['mmlu_pro_philosophy', 'extract_rate'],
+        ['mmlu_pro_computer_science', 'extract_rate'],
+        ['mmlu_pro_history', 'extract_rate'],
+        '',
+        ['cmmlu', 'extract_rate'],
+        ['cmmlu-stem', 'extract_rate'],
+        ['cmmlu-social-science', 'extract_rate'],
+        ['cmmlu-humanities', 'extract_rate'],
+        ['cmmlu-other', 'extract_rate'],
+        ['cmmlu-china-specific', 'extract_rate'],
+
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
+
+
+#######################################################################
+#                        PART 3  Models List                          #
+#######################################################################
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+
+
+#######################################################################
+#              PART 4  Inference/Evaluation Configuration             #
+#######################################################################
+
+# Local Runner
+infer = dict(
+    partitioner=dict(
+        type=NumWorkerPartitioner,
+        num_worker=8
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        retry=0,  # Modify if needed
+        task=dict(type=OpenICLInferTask)
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        task=dict(type=OpenICLEvalTask)),
+)
+
+
+#######################################################################
+#                     PART 5  Utils Configuration                     #
+#######################################################################
+base_exp_dir = 'outputs/corebench/'
+work_dir = osp.join(base_exp_dir, 'chat_objective')
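For readers unfamiliar with summary groups: `core_average` in the config above is reduced with `naive_average`, i.e. an unweighted mean over its eight `(dataset, metric)` subsets. The sketch below illustrates that reduction; the scores are made-up placeholders, not evaluation results.

```python
# Sketch of a 'naive_average' reduction over the core_average subsets.
# All scores below are fabricated placeholders for illustration only.
placeholder_scores = {
    ('mmlu', 'accuracy'): 70.0,
    ('mmlu_pro', 'accuracy'): 55.0,
    ('cmmlu', 'accuracy'): 68.0,
    ('bbh', 'score'): 62.0,
    ('math', 'accuracy'): 50.0,
    ('openai_humaneval', 'humaneval_pass@1'): 72.0,
    ('GPQA_diamond', 'accuracy'): 32.0,
    ('IFEval', 'Prompt-level-strict-accuracy'): 58.0,
}

# Unweighted mean over all subsets, as 'naive_average' implies.
core_average = sum(placeholder_scores.values()) / len(placeholder_scores)
print(f'core_average: {core_average:.2f}')  # 58.38 with these placeholders
```

The `extract_rate` rows serve a different purpose from the accuracy rows: they are meant to report how often the answer extractor managed to parse a final answer out of the model's raw output, so a low extract rate points at prompting or parsing problems rather than at model quality.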
diff --git a/configs/eval_corebench_2409_subjective.py b/configs/eval_corebench_2409_subjective.py
new file mode 100644
index 00000000..c0623c80
--- /dev/null
+++ b/configs/eval_corebench_2409_subjective.py
@@ -0,0 +1,134 @@
+import os.path as osp
+from copy import deepcopy
+
+from mmengine.config import read_base
+from opencompass.models import (HuggingFacewithChatTemplate,
+                                TurboMindModelwithChatTemplate)
+from opencompass.models.openai_api import OpenAI, OpenAISDK
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.runners import DLCRunner, LocalRunner
+from opencompass.summarizers import SubjectiveSummarizer
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+
+
+#######################################################################
+#                      PART 0  Essential Configs                      #
+#######################################################################
+with read_base():
+    # Datasets Part
+    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
+        arenahard_datasets
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
+        alignbench_datasets
+    from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \
+        mtbench_datasets
+
+    # Summarizer
+
+    # Model List
+    # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
+    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
+
+
+#######################################################################
+#                       PART 1  Datasets List                         #
+#######################################################################
+# datasets list for evaluation
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+
+#######################################################################
+#                     PART 2  Dataset Summarizer                      #
+#######################################################################
+summarizer = dict(type=SubjectiveSummarizer, function='subjective')
+
+#######################################################################
+#                        PART 3  Models List                          #
+#######################################################################
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='internlm2_5-7b-chat-turbomind',
+        path='internlm/internlm2_5-7b-chat',
+        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=40, temperature=1.0, top_p=0.9, max_new_tokens=4096),
+        max_seq_len=16384,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], models)
+
+
+
+#######################################################################
+#              PART 4  Inference/Evaluation Configuration             #
+#######################################################################
+
+# Local Runner
+infer = dict(
+    partitioner=dict(
+        type=NumWorkerPartitioner,
+        num_worker=8
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        retry=0,  # Modify if needed
+        task=dict(type=OpenICLInferTask)
+    ),
+)
+
+# JudgeLLM
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+])
+
+
+judge_models = [
+    dict(
+        type=OpenAISDK,
+        abbr='gpt-4o-2024-08-06',
+        path='gpt-4o-2024-08-06',
+        # openai_api_base=
+        # 'http://10.140.1.86:10001/v1',  # Change to your own url if needed.
+        key='YOUR_API_KEY',
+        retry=10,
+        meta_template=api_meta_template,
+        rpm_verbose=True,
+        query_per_second=1,
+        max_out_len=4096,
+        max_seq_len=16384,
+        batch_size=16,
+        temperature=0.01,
+        tokenizer_path='gpt-4o-2024-08-06'
+    )
+]
+
+# Evaluation with local runner
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNaivePartitioner,
+        models=models,
+        judge_models=judge_models,
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        task=dict(type=SubjectiveEvalTask)),
+)
+
+
+
+#######################################################################
+#                     PART 5  Utils Configuration                     #
+#######################################################################
+base_exp_dir = 'outputs/corebench/'
+work_dir = osp.join(base_exp_dir, 'chat_subjective')
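One practical note on the judge setup above: the config ships with the placeholder `key='YOUR_API_KEY'`. A common alternative is to read the key from the environment so it never lands in the repo; the helper below is a hypothetical sketch (the variable name `OPENAI_API_KEY` is an assumption, nothing here is mandated by OpenCompass).

```python
import os

# Hypothetical helper: fetch the judge API key from the environment
# instead of hard-coding it in the config file.
def judge_api_key() -> str:
    key = os.environ.get('OPENAI_API_KEY')  # assumed variable name
    if not key:
        raise RuntimeError(
            'Set OPENAI_API_KEY before launching the subjective eval')
    return key
```

Each config is then launched the usual way, e.g. `python run.py configs/eval_corebench_2409_subjective.py`, with outputs written under the `work_dir` set in PART 5 (`outputs/corebench/chat_subjective` here).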