[Update] Update max_out_len for datasets (#1726)

* [Update] Update max_out_len for datasets

* Update eval_regression_chat_objective_fullbench.py

* Update eval_regression_chat.py

* Update eval_regression_chat.py

* Update oc_score_baseline_fullbench.yaml

---------

Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
This commit is contained in:
Linchen Xiao 2024-12-02 11:42:07 +08:00 committed by GitHub
parent fe6d76fb13
commit 9de27b4d85
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 218 additions and 18 deletions

View File

@ -22,7 +22,7 @@ with read_base():
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
# new datasets in Fullbench v1.1
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
@ -46,7 +46,7 @@ with read_base():
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
mmmlu_lite_datasets # noqa: F401, E501
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
from opencompass.configs.datasets.musr.musr_gen_3622bb import \
musr_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501

View File

@ -70,7 +70,7 @@ internlm2_5-7b-chat-turbomind_fullbench:
drop: 75
hellaswag: 81.25
TheoremQA: 6.25
musr_average: 39.58
musr_average: 37.5
gsm8k: 68.75
math: 75
GPQA_diamond: 25

View File

@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
)
subjective_eval_cfg = dict(

View File

@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
# GSM8K generation config: reader, inference and evaluation settings, plus the
# `gsm8k_datasets` list that bundles them into a single dataset entry.

# The reader takes `question` as the only input column and `answer` as the
# output column.
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

# Inference: one HUMAN turn built from the `{question}` placeholder, paired
# with ZeroRetriever and GenInferencer (max_out_len=2048).
gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # The instruction asks the model to put its final answer
                # inside \boxed{}.
                dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048),
)

# Evaluation uses MATHEvaluator (version 'v2') with math_postprocess_v2 as the
# prediction post-processor and gsm8k_dataset_postprocess as the dataset
# post-processor.
# NOTE(review): presumably the MATH-style evaluator is used because the prompt
# requests a \boxed{} answer -- confirm against the evaluator implementation.
gsm8k_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)

# Single dataset entry, abbreviated 'gsm8k', wiring the three configs above to
# GSM8KDataset at path 'opencompass/gsm8k'.
gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg,
    )
]

View File

@ -0,0 +1,135 @@
from opencompass.datasets import MusrDataset, MusrEvaluator
from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer
# Per-task settings for the three MuSR subsets, keyed by task name.
#
# The three entries share the same path, reader columns, prompt template
# (SYSTEM turn from `{system_prompt}` with a HUMAN fallback role, then a HUMAN
# turn from `{prompt}`), retriever and evaluator arguments. They differ only
# in `abbr`, `name` and the inferencer's `max_out_len`.
#
# NOTE(review): `max_out_len` is 2048 for murder_mysteries but 512 for
# object_placements and team_allocation -- confirm the asymmetry is intended.
DATASET_CONFIGS = {
    'murder_mysteries': {
        'abbr': 'musr_murder_mysteries',
        'name': 'murder_mysteries',
        'path': 'opencompass/musr',
        'reader_cfg': dict(
            input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
            output_column='gold_answer',
        ),
        'infer_cfg': dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='{system_prompt}'
                        )
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt='{prompt}'
                        ),
                    ]
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            # Larger output budget than the other two tasks (2048 vs 512).
            inferencer=dict(type=GenInferencer, max_out_len=2048),
        ),
        'eval_cfg': dict(
            evaluator=dict(
                type=MusrEvaluator,
                answer_index_modifier=1,
                self_consistency_n=1
            ),
        ),
    },
    'object_placements': {
        'abbr': 'musr_object_placements',
        'name': 'object_placements',
        'path': 'opencompass/musr',
        'reader_cfg': dict(
            input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
            output_column='gold_answer',
        ),
        'infer_cfg': dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='{system_prompt}'
                        )
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt='{prompt}'
                        ),
                    ]
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        ),
        'eval_cfg': dict(
            evaluator=dict(
                type=MusrEvaluator,
                answer_index_modifier=1,
                self_consistency_n=1
            ),
        ),
    },
    'team_allocation': {
        'abbr': 'musr_team_allocation',
        'name': 'team_allocation',
        'path': 'opencompass/musr',
        'reader_cfg': dict(
            input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
            output_column='gold_answer',
        ),
        'infer_cfg': dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='{system_prompt}'
                        )
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt='{prompt}'
                        ),
                    ]
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        ),
        'eval_cfg': dict(
            evaluator=dict(
                type=MusrEvaluator,
                answer_index_modifier=1,
                self_consistency_n=1
            ),
        ),
    },
}
# Build the exported dataset list: one entry per task in DATASET_CONFIGS, in
# that mapping's iteration order, each typed as MusrDataset.
# The loop variable names are left unchanged because they are module-level
# names in this config file.
musr_datasets = []
for config in DATASET_CONFIGS.values():
    dataset = {
        'abbr': config['abbr'],
        'type': MusrDataset,
        'path': config['path'],
        'name': config['name'],
        'reader_cfg': config['reader_cfg'],
        'infer_cfg': config['infer_cfg'],
        'eval_cfg': config['eval_cfg'],
    }
    musr_datasets.append(dataset)

View File

@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
)
subjective_eval_cfg = dict(

View File

@ -0,0 +1,22 @@
from opencompass.models import TurboMindModelwithChatTemplate
# Model list with a single TurboMindModelwithChatTemplate entry for
# 'mistralai/Mistral-Large-Instruct-2411'.
#
# NOTE(review): `abbr` reads "mixtral-..." while `path` points at a
# Mistral-Large checkpoint -- possibly a typo for "mistral-...". Confirm before
# renaming, since result tables or baselines may be keyed on the abbr.
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='mixtral-large-instruct-2411-turbomind',
        path='mistralai/Mistral-Large-Instruct-2411',
        # Engine settings; `tp=4` matches `num_gpus=4` in run_cfg below, and
        # `session_len` matches `max_seq_len` (both 32768).
        engine_config=dict(
            session_len=32768,
            max_batch_size=16,
            tp=4,
            cache_max_entry_count=0.7,
        ),
        # top_k=1 with temperature=1e-6 -- presumably intended to make
        # decoding effectively greedy; `max_new_tokens` matches `max_out_len`
        # (both 4096).
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=4096,
        # NOTE(review): batch_size (8) is smaller than the engine's
        # max_batch_size (16) -- confirm this is deliberate.
        batch_size=8,
        run_cfg=dict(num_gpus=4),
    )
]

View File

@ -6,6 +6,7 @@ from .anli import AnliDataset # noqa: F401, F403
from .anthropics_evals import * # noqa: F401, F403
from .apps import * # noqa: F401, F403
from .arc import * # noqa: F401, F403
from .arc_prize_public_evaluation import * # noqa: F401, F403
from .ax import * # noqa: F401, F403
from .babilong import * # noqa: F401, F403
from .bbh import * # noqa: F401, F403

View File

@ -44,11 +44,13 @@ class Gemini(BaseAPIModel):
top_p: float = 0.8,
top_k: float = 10.0,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
super().__init__(
path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry,
)
assert isinstance(key, str)
if key == 'ENV':
if 'GEMINI_API_KEY' not in os.environ:
@ -56,8 +58,11 @@ class Gemini(BaseAPIModel):
key = os.getenv('GEMINI_API_KEY')
assert path in [
'gemini-1.0-pro', 'gemini-pro', 'gemini-1.5-flash',
'gemini-1.5-pro'
'gemini-1.0-pro',
'gemini-pro',
'gemini-1.5-flash',
'gemini-1.5-pro',
'gemini-1.5-pro-latest',
] # https://ai.google.dev/gemini-api/docs/models/gemini#model-variations
self.url = f'https://generativelanguage.googleapis.com/v1beta/models/{path}:generateContent?key={key}'
@ -147,19 +152,19 @@ class Gemini(BaseAPIModel):
'safetySettings': [
{
'category': 'HARM_CATEGORY_DANGEROUS_CONTENT',
'threshold': 'BLOCK_NONE'
'threshold': 'BLOCK_NONE',
},
{
'category': 'HARM_CATEGORY_HATE_SPEECH',
'threshold': 'BLOCK_NONE'
'threshold': 'BLOCK_NONE',
},
{
'category': 'HARM_CATEGORY_HARASSMENT',
'threshold': 'BLOCK_NONE'
'threshold': 'BLOCK_NONE',
},
{
'category': 'HARM_CATEGORY_DANGEROUS_CONTENT',
'threshold': 'BLOCK_NONE'
'threshold': 'BLOCK_NONE',
},
],
'generationConfig': {
@ -167,8 +172,8 @@ class Gemini(BaseAPIModel):
'temperature': self.temperature,
'maxOutputTokens': 2048,
'topP': self.top_p,
'topK': self.top_k
}
'topK': self.top_k,
},
}
for _ in range(self.retry):