From 9de27b4d85d9fd8439d43f6fe133338c741b427e Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Mon, 2 Dec 2024 11:42:07 +0800
Subject: [PATCH] [Update] Update max_out_len for datasets (#1726)

* [Update] Update max_out_len for datasets

* Update eval_regression_chat_objective_fullbench.py

* Update eval_regression_chat.py

* Update eval_regression_chat.py

* Update oc_score_baseline_fullbench.yaml

---------

Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
---
 ...val_regression_chat_objective_fullbench.py |   4 +-
 .../scripts/oc_score_baseline_fullbench.yaml  |   2 +-
 .../compassarena/compassarena_compare.py      |   2 +-
 .../gsm8k/gsm8k_0shot_v2_gen_6e39a4.py        |  37 +++++
 .../configs/datasets/musr/musr_gen_3622bb.py  | 135 ++++++++++++++++++
 .../compassarena/compassarena_compare.py      |   2 +-
 .../lmdeploy_mistral_large_instruct_2411.py   |  22 +++
 opencompass/datasets/__init__.py              |   1 +
 opencompass/models/gemini_api.py              |  31 ++--
 9 files changed, 218 insertions(+), 18 deletions(-)
 create mode 100644 opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py
 create mode 100644 opencompass/configs/datasets/musr/musr_gen_3622bb.py
 create mode 100644 opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py

diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_objective_fullbench.py
index c66fba33..368fe040 100644
--- a/.github/scripts/eval_regression_chat_objective_fullbench.py
+++ b/.github/scripts/eval_regression_chat_objective_fullbench.py
@@ -22,7 +22,7 @@ with read_base():
     from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
         gpqa_datasets  # noqa: F401, E501
     # new datasets in Fullbench v1.1
-    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
+    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
         gsm8k_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
         hellaswag_datasets  # noqa: F401, E501
@@ -46,7 +46,7 @@ with read_base():
         mmlu_pro_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
         mmmlu_lite_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
+    from opencompass.configs.datasets.musr.musr_gen_3622bb import \
         musr_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
         nq_datasets  # noqa: F401, E501
diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml
index 413a99a3..49393e05 100644
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -70,7 +70,7 @@ internlm2_5-7b-chat-turbomind_fullbench:
   drop: 75
   hellaswag: 81.25
   TheoremQA: 6.25
-  musr_average: 39.58
+  musr_average: 37.5
   gsm8k: 68.75
   math: 75
   GPQA_diamond: 25
diff --git a/configs/datasets/subjective/compassarena/compassarena_compare.py b/configs/datasets/subjective/compassarena/compassarena_compare.py
index e175a787..2c9b3e9b 100644
--- a/configs/datasets/subjective/compassarena/compassarena_compare.py
+++ b/configs/datasets/subjective/compassarena/compassarena_compare.py
@@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
     )
 
     subjective_eval_cfg = dict(
diff --git a/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py
new file mode 100644
index 00000000..3888678c
--- /dev/null
+++ b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_6e39a4.py
@@ -0,0 +1,37 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
+from opencompass.datasets import MATHEvaluator, math_postprocess_v2
+
+gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=2048),
+)
+
+gsm8k_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'),
+    pred_postprocessor=dict(type=math_postprocess_v2),
+    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
+)
+
+gsm8k_datasets = [
+    dict(
+        abbr='gsm8k',
+        type=GSM8KDataset,
+        path='opencompass/gsm8k',
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg,
+    )
+]
diff --git a/opencompass/configs/datasets/musr/musr_gen_3622bb.py b/opencompass/configs/datasets/musr/musr_gen_3622bb.py
new file mode 100644
index 00000000..93c065f0
--- /dev/null
+++ b/opencompass/configs/datasets/musr/musr_gen_3622bb.py
@@ -0,0 +1,135 @@
+from opencompass.datasets import MusrDataset, MusrEvaluator
+from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer
+
+
+DATASET_CONFIGS = {
+    'murder_mysteries': {
+        'abbr': 'musr_murder_mysteries',
+        'name': 'murder_mysteries',
+        'path': 'opencompass/musr',
+        'reader_cfg': dict(
+            input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
+            output_column='gold_answer',
+        ),
+        'infer_cfg': dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt='{system_prompt}'
+                        )
+                    ],
+                    round=[
+                        dict(
+                            role='HUMAN',
+                            prompt='{prompt}'
+                        ),
+                    ]
+                ),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=2048),
+        ),
+        'eval_cfg': dict(
+            evaluator=dict(
+                type=MusrEvaluator,
+                answer_index_modifier=1,
+                self_consistency_n=1
+            ),
+        ),
+    },
+    'object_placements': {
+        'abbr': 'musr_object_placements',
+        'name': 'object_placements',
+        'path': 'opencompass/musr',
+        'reader_cfg': dict(
+            input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
+            output_column='gold_answer',
+        ),
+        'infer_cfg': dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt='{system_prompt}'
+                        )
+                    ],
+                    round=[
+                        dict(
+                            role='HUMAN',
+                            prompt='{prompt}'
+                        ),
+                    ]
+                ),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=512),
+        ),
+        'eval_cfg': dict(
+            evaluator=dict(
+                type=MusrEvaluator,
+                answer_index_modifier=1,
+                self_consistency_n=1
+            ),
+        ),
+    },
+    'team_allocation': {
+        'abbr': 'musr_team_allocation',
+        'name': 'team_allocation',
+        'path': 'opencompass/musr',
+        'reader_cfg': dict(
+            input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
+            output_column='gold_answer',
+        ),
+        'infer_cfg': dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt='{system_prompt}'
+                        )
+                    ],
+                    round=[
+                        dict(
+                            role='HUMAN',
+                            prompt='{prompt}'
+                        ),
+                    ]
+                ),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=512),
+        ),
+        'eval_cfg': dict(
+            evaluator=dict(
+                type=MusrEvaluator,
+                answer_index_modifier=1,
+                self_consistency_n=1
+            ),
+        ),
+    },
+}
+
+
+musr_datasets = []
+
+for config in DATASET_CONFIGS.values():
+    dataset = dict(
+        abbr=config['abbr'],
+        type=MusrDataset,
+        path=config['path'],
+        name=config['name'],
+        reader_cfg=config['reader_cfg'],
+        infer_cfg=config['infer_cfg'],
+        eval_cfg=config['eval_cfg'],
+    )
+    musr_datasets.append(dataset)
diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py
index e175a787..2c9b3e9b 100644
--- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py
+++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py
@@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
     )
 
     subjective_eval_cfg = dict(
diff --git a/opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py b/opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py
new file mode 100644
index 00000000..205dc27b
--- /dev/null
+++ b/opencompass/configs/models/mistral/lmdeploy_mistral_large_instruct_2411.py
@@ -0,0 +1,22 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='mistral-large-instruct-2411-turbomind',
+        path='mistralai/Mistral-Large-Instruct-2411',
+        engine_config=dict(
+            session_len=32768,
+            max_batch_size=16,
+            tp=4,
+            cache_max_entry_count=0.7,
+        ),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
+        ),
+        max_seq_len=32768,
+        max_out_len=4096,
+        batch_size=8,
+        run_cfg=dict(num_gpus=4),
+    )
+]
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 20e6fecb..a590b8dd 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -6,6 +6,7 @@ from .anli import AnliDataset  # noqa: F401, F403
 from .anthropics_evals import *  # noqa: F401, F403
 from .apps import *  # noqa: F401, F403
 from .arc import *  # noqa: F401, F403
+from .arc_prize_public_evaluation import *  # noqa: F401, F403
 from .ax import *  # noqa: F401, F403
 from .babilong import *  # noqa: F401, F403
 from .bbh import *  # noqa: F401, F403
diff --git a/opencompass/models/gemini_api.py b/opencompass/models/gemini_api.py
index 7695b218..0020cf22 100644
--- a/opencompass/models/gemini_api.py
+++ b/opencompass/models/gemini_api.py
@@ -44,11 +44,13 @@ class Gemini(BaseAPIModel):
         top_p: float = 0.8,
         top_k: float = 10.0,
     ):
-        super().__init__(path=path,
-                         max_seq_len=max_seq_len,
-                         query_per_second=query_per_second,
-                         meta_template=meta_template,
-                         retry=retry)
+        super().__init__(
+            path=path,
+            max_seq_len=max_seq_len,
+            query_per_second=query_per_second,
+            meta_template=meta_template,
+            retry=retry,
+        )
         assert isinstance(key, str)
         if key == 'ENV':
             if 'GEMINI_API_KEY' not in os.environ:
@@ -56,8 +58,11 @@ class Gemini(BaseAPIModel):
             key = os.getenv('GEMINI_API_KEY')
 
         assert path in [
-            'gemini-1.0-pro', 'gemini-pro', 'gemini-1.5-flash',
-            'gemini-1.5-pro'
+            'gemini-1.0-pro',
+            'gemini-pro',
+            'gemini-1.5-flash',
+            'gemini-1.5-pro',
+            'gemini-1.5-pro-latest',
         ]  # https://ai.google.dev/gemini-api/docs/models/gemini#model-variations
 
         self.url = f'https://generativelanguage.googleapis.com/v1beta/models/{path}:generateContent?key={key}'
@@ -147,19 +152,19 @@
             'safetySettings': [
                 {
                     'category': 'HARM_CATEGORY_DANGEROUS_CONTENT',
-                    'threshold': 'BLOCK_NONE'
+                    'threshold': 'BLOCK_NONE',
                 },
                 {
                     'category': 'HARM_CATEGORY_HATE_SPEECH',
-                    'threshold': 'BLOCK_NONE'
+                    'threshold': 'BLOCK_NONE',
                 },
                 {
                     'category': 'HARM_CATEGORY_HARASSMENT',
-                    'threshold': 'BLOCK_NONE'
+                    'threshold': 'BLOCK_NONE',
                 },
                 {
                     'category': 'HARM_CATEGORY_DANGEROUS_CONTENT',
-                    'threshold': 'BLOCK_NONE'
+                    'threshold': 'BLOCK_NONE',
                 },
             ],
             'generationConfig': {
@@ -167,8 +172,8 @@
                 'temperature': self.temperature,
                 'maxOutputTokens': 2048,
                 'topP': self.top_p,
-                'topK': self.top_k
-            }
+                'topK': self.top_k,
+            },
         }
 
         for _ in range(self.retry):
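
Note: the three new config fragments in this patch are meant to be composed
through `read_base()`, exactly as the updated regression script at the top
does. A minimal standalone sketch of such a composition (the filename
`eval_gsm8k_musr_mistral.py` and the dataset/model combination are
illustrative, not part of the patch; OpenCompass configs like this are
typically run with `python run.py <path to the config>`):

    # eval_gsm8k_musr_mistral.py: hypothetical top-level config composing the
    # fragments added in this patch.
    from mmengine.config import read_base

    with read_base():
        # 0-shot GSM8K scored with the v2 MATH evaluator (max_out_len=2048)
        from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
            gsm8k_datasets
        # the three MuSR splits: murder_mysteries, object_placements, team_allocation
        from opencompass.configs.datasets.musr.musr_gen_3622bb import \
            musr_datasets
        # Mistral-Large-Instruct-2411 served through LMDeploy TurboMind on 4 GPUs
        from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
            models

    # OpenCompass picks up the top-level `datasets` and `models` variables.
    datasets = [*gsm8k_datasets, *musr_datasets]

Because each fragment carries its own generation budget, a composed run
inherits the per-dataset limits set here: 2048 tokens for GSM8K, 2048/512 for
the MuSR splits, and 4096 on the model side.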