Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

[Update] Update configurations (#1704)

Parent: ed81f9df30
Commit: 500fb1032a
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .livecodebench_gen_b2b0fd import LCB_datasets  # noqa: F401, F403
+    from .livecodebench_gen_6966bc import LCB_datasets  # noqa: F401, F403
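Context for this hunk: imports inside mmengine's `with read_base():` block are resolved at config-parse time, so this aggregator now re-exports `LCB_datasets` from the refreshed `_6966bc` config added below. A minimal sketch of a run config consuming it (the file name and relative import path are assumptions, not part of this commit):

from mmengine.config import read_base

with read_base():
    # assumed relative path from a run config under configs/
    from .datasets.livecodebench.livecodebench_gen import \
        LCB_datasets  # noqa: F401, F403

datasets = [*LCB_datasets]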
configs/datasets/livecodebench/livecodebench_gen_6966bc.py (new file, 164 lines)
@@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test Output Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
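Illustrative only: `PromptTemplate` fills the reader columns into the `{...}` placeholders of the HUMAN turn, so the code-generation prompt above behaves roughly like plain `str.format` (the sample row below is made up):

prompt_template = ('### Question:\n{question_content}\n\n{format_prompt}'
                   '### Answer: (use the provided format with backticks)\n\n')

sample = {  # hypothetical row, for illustration only
    'question_content': 'Given a list of integers, print their sum.',
    'format_prompt': 'Read space-separated integers from stdin.\n\n',
}
print(prompt_template.format(**sample))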
opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='glm-4-9b-turbomind',
        path='THUDM/glm-4-9b',
        engine_config=dict(max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=8192,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
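A minimal sketch of pairing a model config like this with the LiveCodeBench datasets in a run config (the file name and import paths are assumptions; newer OpenCompass releases expose configs as the `opencompass.configs` package):

from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import models
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import \
        LCB_datasets

datasets = [*LCB_datasets]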
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='qwen2.5-14b-turbomind',
        path='Qwen/Qwen2.5-14B',
        engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
    )
]
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='qwen2.5-32b-turbomind',
        path='Qwen/Qwen2.5-32B',
        engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
    )
]
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='qwen2.5-72b-turbomind',
        path='Qwen/Qwen2.5-72B',
        engine_config=dict(session_len=7168, max_batch_size=16, tp=4),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024
        ),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=16,
        run_cfg=dict(num_gpus=4),
    )
]
opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='yi-1.5-9b-turbomind',
        path='01-ai/Yi-1.5-9B',
        engine_config=dict(session_len=4096, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=4096,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
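All five new model configs follow one TurboMind pattern: `tp` in `engine_config` matches `num_gpus` in `run_cfg`, `max_new_tokens` mirrors `max_out_len`, and `top_k=1` with temperature `1e-6` makes decoding effectively greedy. Assuming a standard OpenCompass checkout, they can also be selected by config name on the command line, for example:

python run.py --models lmdeploy_qwen2_5_32b --datasets livecodebench_gen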
@@ -526,7 +526,7 @@ class OpenAISDK(OpenAI):
 
     def _generate(self, input: PromptList | str, max_out_len: int,
                   temperature: float) -> str:
-        from openai import BadRequestError
+        from openai import APIStatusError, BadRequestError
         assert isinstance(input, (str, PromptList))
 
         # max num token for gpt-3.5-turbo is 4097
@@ -616,7 +616,7 @@ class OpenAISDK(OpenAI):
                     from the API provider.')
             return responses.choices[0].message.content
 
-        except BadRequestError as e:
+        except (BadRequestError, APIStatusError) as e:
             # Handle BadRequest status
             # You can specify self.status_code_mappings to bypass \
             # API sensitivity blocks
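Why the widened clause matters: in openai-python v1, `BadRequestError` is only one subclass of `APIStatusError`, so catching both routes any non-2xx HTTP status (401, 429, 5xx, ...) through the same status-code handling instead of letting it propagate. A standalone sketch of the pattern (the model name and call are illustrative):

from openai import APIStatusError, BadRequestError, OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
try:
    rsp = client.chat.completions.create(
        model='gpt-4o-mini',  # placeholder model name
        messages=[{'role': 'user', 'content': 'hello'}],
    )
    print(rsp.choices[0].message.content)
except (BadRequestError, APIStatusError) as e:
    # Both exception types expose .status_code, which is what
    # OpenAISDK's status_code_mappings keys on.
    print('API returned status', e.status_code)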
@@ -87,6 +87,7 @@ class TurboMindModelwithChatTemplate(BaseModel):
     def generate(self,
                  inputs: List[str],
                  max_out_len: int,
+                 min_out_len: Optional[int] = None,
                  stopping_criteria: List[str] = [],
                  do_sample: Optional[bool] = None,
                  temperature: float = 1.0,
@@ -123,6 +124,10 @@ class TurboMindModelwithChatTemplate(BaseModel):
 
         gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
         gen_config.update(self.gen_config)
+        if max_out_len is not None:
+            gen_config['max_new_tokens'] = max_out_len
+        if min_out_len is not None:
+            gen_config['min_new_tokens'] = min_out_len
         if do_sample or ('do_sample' in self.gen_config and self.gen_config['do_sample']):
             gen_config['top_k'] = 40
             gen_config['temperature'] = temperature
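The net effect of the second hunk, isolated into a toy helper (a reconstruction for illustration, not the actual method; the default config value is made up):

import copy
from typing import Optional

DEFAULT_GEN_CONFIG = dict(max_new_tokens=512)  # placeholder default

def build_gen_config(model_gen_config: dict,
                     max_out_len: Optional[int],
                     min_out_len: Optional[int]) -> dict:
    # Per-call output-length limits override model-level defaults,
    # mirroring the merged logic above.
    gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
    gen_config.update(model_gen_config)
    if max_out_len is not None:
        gen_config['max_new_tokens'] = max_out_len
    if min_out_len is not None:
        gen_config['min_new_tokens'] = min_out_len
    return gen_config

print(build_gen_config(dict(top_k=1), max_out_len=1024, min_out_len=8))
# {'max_new_tokens': 1024, 'top_k': 1, 'min_new_tokens': 8}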