mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Update] Update max_out_len for datasets (#1726)
* [Update] Update max_out_len for datasets
* Update eval_regression_chat_objective_fullbench.py
* Update eval_regression_chat.py
* Update eval_regression_chat.py
* Update oc_score_baseline_fullbench.yaml

---------

Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
This commit is contained in:
parent
fe6d76fb13
commit
9de27b4d85
@ -22,7 +22,7 @@ with read_base():
|
||||
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
|
||||
gpqa_datasets # noqa: F401, E501
|
||||
# new datasets in Fullbench v1.1
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
|
||||
gsm8k_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
|
||||
hellaswag_datasets # noqa: F401, E501
|
||||
@ -46,7 +46,7 @@ with read_base():
|
||||
mmlu_pro_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
|
||||
mmmlu_lite_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import \
|
||||
from opencompass.configs.datasets.musr.musr_gen_3622bb import \
|
||||
musr_datasets # noqa: F401, E501
|
||||
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
|
||||
nq_datasets # noqa: F401, E501
|
||||
|
@ -70,7 +70,7 @@ internlm2_5-7b-chat-turbomind_fullbench:
|
||||
drop: 75
|
||||
hellaswag: 81.25
|
||||
TheoremQA: 6.25
|
||||
musr_average: 39.58
|
||||
musr_average: 37.5
|
||||
gsm8k: 68.75
|
||||
math: 75
|
||||
GPQA_diamond: 25
|
||||
|
@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -0,0 +1,37 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
|
||||
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
|
||||
|
||||
# Reader: prompts are filled from the `question` column and `answer` is the
# output column.
gsm8k_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
}

# Inference: a single HUMAN turn with no retrieved examples (ZeroRetriever);
# the prompt asks for step-by-step reasoning and a final answer in \boxed{}.
gsm8k_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': '{question}\nPlease reason step by step, and put your final answer within \\boxed{}.',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 2048},
}

# Evaluation: predictions go through `math_postprocess_v2`, the dataset side
# through `gsm8k_dataset_postprocess`, and scoring uses MATHEvaluator (v2).
gsm8k_eval_cfg = {
    'evaluator': {'type': MATHEvaluator, 'version': 'v2'},
    'pred_postprocessor': {'type': math_postprocess_v2},
    'dataset_postprocessor': {'type': gsm8k_dataset_postprocess},
}

# Exported dataset list picked up by configs that import `gsm8k_datasets`.
gsm8k_datasets = [
    {
        'abbr': 'gsm8k',
        'type': GSM8KDataset,
        'path': 'opencompass/gsm8k',
        'reader_cfg': gsm8k_reader_cfg,
        'infer_cfg': gsm8k_infer_cfg,
        'eval_cfg': gsm8k_eval_cfg,
    },
]
|
135
opencompass/configs/datasets/musr/musr_gen_3622bb.py
Normal file
135
opencompass/configs/datasets/musr/musr_gen_3622bb.py
Normal file
@ -0,0 +1,135 @@
|
||||
from opencompass.datasets import MusrDataset, MusrEvaluator
|
||||
from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer
|
||||
|
||||
|
||||
# Per-subset MuSR configuration, keyed by subset name.
#
# The three subsets share the same reader columns, prompt layout and evaluator
# settings; only the subset name (which also determines `abbr`) and the
# generation budget `max_out_len` differ. Every entry is built from fresh
# dict/list objects so that no state is shared between subsets.
DATASET_CONFIGS = {
    subset: {
        'abbr': f'musr_{subset}',
        'name': subset,
        'path': 'opencompass/musr',
        'reader_cfg': {
            'input_columns': [
                'context',
                'question_text',
                'question',
                'answer',
                'choices',
                'choices_str',
                'intermediate_trees',
                'intermediate_data',
                'prompt',
                'system_prompt',
                'gold_answer',
                'scidx',
                'self_consistency_n',
                'ablation_name',
            ],
            'output_column': 'gold_answer',
        },
        'infer_cfg': {
            'prompt_template': {
                'type': PromptTemplate,
                'template': {
                    # System prompt first; falls back to the HUMAN role when
                    # the SYSTEM role is unavailable.
                    'begin': [
                        {
                            'role': 'SYSTEM',
                            'fallback_role': 'HUMAN',
                            'prompt': '{system_prompt}',
                        },
                    ],
                    'round': [
                        {
                            'role': 'HUMAN',
                            'prompt': '{prompt}',
                        },
                    ],
                },
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': out_len},
        },
        'eval_cfg': {
            'evaluator': {
                'type': MusrEvaluator,
                'answer_index_modifier': 1,
                'self_consistency_n': 1,
            },
        },
    }
    for subset, out_len in (
        ('murder_mysteries', 2048),
        ('object_placements', 512),
        ('team_allocation', 512),
    )
}

# Flatten the per-subset configs into the dataset list that other configs
# import as `musr_datasets`.
musr_datasets = []

for config in DATASET_CONFIGS.values():
    dataset = {
        'abbr': config['abbr'],
        'type': MusrDataset,
        'path': config['path'],
        'name': config['name'],
        'reader_cfg': config['reader_cfg'],
        'infer_cfg': config['infer_cfg'],
        'eval_cfg': config['eval_cfg'],
    }
    musr_datasets.append(dataset)
|
@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -0,0 +1,22 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
# Model entry for Mistral-Large-Instruct-2411 served through
# TurboMindModelwithChatTemplate.
#
# NOTE(review): `abbr` is spelled "mixtral" while `path` points at
# Mistral-Large. This looks like a naming slip, but `abbr` may be referenced
# by summaries or baselines elsewhere, so confirm before renaming.
models = [
    {
        'type': TurboMindModelwithChatTemplate,
        'abbr': 'mixtral-large-instruct-2411-turbomind',
        'path': 'mistralai/Mistral-Large-Instruct-2411',
        # Engine settings; `tp` equals `run_cfg.num_gpus` and `session_len`
        # equals `max_seq_len` below.
        'engine_config': {
            'session_len': 32768,
            'max_batch_size': 16,
            'tp': 4,
            'cache_max_entry_count': 0.7,
        },
        # Sampling settings; `max_new_tokens` equals `max_out_len` below.
        'gen_config': {
            'top_k': 1,
            'temperature': 1e-6,
            'top_p': 0.9,
            'max_new_tokens': 4096,
        },
        'max_seq_len': 32768,
        'max_out_len': 4096,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 4},
    },
]
|
@ -6,6 +6,7 @@ from .anli import AnliDataset # noqa: F401, F403
|
||||
from .anthropics_evals import * # noqa: F401, F403
|
||||
from .apps import * # noqa: F401, F403
|
||||
from .arc import * # noqa: F401, F403
|
||||
from .arc_prize_public_evaluation import * # noqa: F401, F403
|
||||
from .ax import * # noqa: F401, F403
|
||||
from .babilong import * # noqa: F401, F403
|
||||
from .bbh import * # noqa: F401, F403
|
||||
|
@ -44,11 +44,13 @@ class Gemini(BaseAPIModel):
|
||||
top_p: float = 0.8,
|
||||
top_k: float = 10.0,
|
||||
):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
query_per_second=query_per_second,
|
||||
meta_template=meta_template,
|
||||
retry=retry)
|
||||
super().__init__(
|
||||
path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
query_per_second=query_per_second,
|
||||
meta_template=meta_template,
|
||||
retry=retry,
|
||||
)
|
||||
assert isinstance(key, str)
|
||||
if key == 'ENV':
|
||||
if 'GEMINI_API_KEY' not in os.environ:
|
||||
@ -56,8 +58,11 @@ class Gemini(BaseAPIModel):
|
||||
key = os.getenv('GEMINI_API_KEY')
|
||||
|
||||
assert path in [
|
||||
'gemini-1.0-pro', 'gemini-pro', 'gemini-1.5-flash',
|
||||
'gemini-1.5-pro'
|
||||
'gemini-1.0-pro',
|
||||
'gemini-pro',
|
||||
'gemini-1.5-flash',
|
||||
'gemini-1.5-pro',
|
||||
'gemini-1.5-pro-latest',
|
||||
] # https://ai.google.dev/gemini-api/docs/models/gemini#model-variations
|
||||
|
||||
self.url = f'https://generativelanguage.googleapis.com/v1beta/models/{path}:generateContent?key={key}'
|
||||
@ -147,19 +152,19 @@ class Gemini(BaseAPIModel):
|
||||
'safetySettings': [
|
||||
{
|
||||
'category': 'HARM_CATEGORY_DANGEROUS_CONTENT',
|
||||
'threshold': 'BLOCK_NONE'
|
||||
'threshold': 'BLOCK_NONE',
|
||||
},
|
||||
{
|
||||
'category': 'HARM_CATEGORY_HATE_SPEECH',
|
||||
'threshold': 'BLOCK_NONE'
|
||||
'threshold': 'BLOCK_NONE',
|
||||
},
|
||||
{
|
||||
'category': 'HARM_CATEGORY_HARASSMENT',
|
||||
'threshold': 'BLOCK_NONE'
|
||||
'threshold': 'BLOCK_NONE',
|
||||
},
|
||||
{
|
||||
'category': 'HARM_CATEGORY_DANGEROUS_CONTENT',
|
||||
'threshold': 'BLOCK_NONE'
|
||||
'threshold': 'BLOCK_NONE',
|
||||
},
|
||||
],
|
||||
'generationConfig': {
|
||||
@ -167,8 +172,8 @@ class Gemini(BaseAPIModel):
|
||||
'temperature': self.temperature,
|
||||
'maxOutputTokens': 2048,
|
||||
'topP': self.top_p,
|
||||
'topK': self.top_k
|
||||
}
|
||||
'topK': self.top_k,
|
||||
},
|
||||
}
|
||||
|
||||
for _ in range(self.retry):
|
||||
|
Loading…
Reference in New Issue
Block a user