Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Update] Update configurations (#1704)

parent: ed81f9df30
commit: 500fb1032a
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .livecodebench_gen_b2b0fd import LCB_datasets  # noqa: F401, F403
+    from .livecodebench_gen_6966bc import LCB_datasets  # noqa: F401, F403
configs/datasets/livecodebench/livecodebench_gen_6966bc.py (new file, 164 lines)
@@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
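The three datasets are exported together as LCB_datasets, which a top-level evaluation config can pull in alongside a model list. A minimal sketch of that wiring, assuming standard OpenCompass entry-point conventions; the file name and the relative import paths below are illustrative and depend on where the file lives in a checkout:

# eval_livecodebench.py -- hypothetical top-level config (illustrative paths)
from mmengine.config import read_base

with read_base():
    from .configs.datasets.livecodebench.livecodebench_gen import \
        LCB_datasets  # noqa: F401, F403
    from .configs.models.chatglm.lmdeploy_glm4_9b import \
        models  # noqa: F401, F403

datasets = LCB_datasets  # generation, execution, and test-output tasks

Launched with something like `python run.py eval_livecodebench.py`, the runner would then evaluate every entry in models against every dataset in datasets.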
opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='glm-4-9b-turbomind',
        path='THUDM/glm-4-9b',
        engine_config=dict(max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=8192,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
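For orientation, the engine_config and gen_config dicts in these files correspond roughly to lmdeploy's own TurbomindEngineConfig and GenerationConfig objects. A hedged standalone sketch of that mapping (the prompt is illustrative, and exact field support may vary by lmdeploy version):

# Sketch: driving the TurboMind backend directly with the same settings as
# the config above (illustrative; OpenCompass normally does this internally).
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

pipe = pipeline(
    'THUDM/glm-4-9b',  # same HF path as in the config above
    backend_config=TurbomindEngineConfig(max_batch_size=16, tp=1),
)
outputs = pipe(
    ['Write a Python function that reverses a string.'],
    gen_config=GenerationConfig(top_k=1, temperature=1e-6, top_p=0.9,
                                max_new_tokens=2048),
)
print(outputs[0].text)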
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='qwen2.5-14b-turbomind',
        path='Qwen/Qwen2.5-14B',
        engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
    )
]
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='qwen2.5-32b-turbomind',
        path='Qwen/Qwen2.5-32B',
        engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
    )
]
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='qwen2.5-72b-turbomind',
        path='Qwen/Qwen2.5-72B',
        engine_config=dict(session_len=7168, max_batch_size=16, tp=4),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024
        ),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=16,
        run_cfg=dict(num_gpus=4),
    )
]
opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='yi-1.5-9b-turbomind',
        path='01-ai/Yi-1.5-9B',
        engine_config=dict(session_len=4096, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=4096,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
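All five model files follow the same pattern, so a top-level config can merge several of their models lists into one run. A hedged sketch; the import paths inside read_base are assumptions based on the file locations above:

# Illustrative: gather model lists added in this commit into one evaluation.
from mmengine.config import read_base

with read_base():
    from .opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b import \
        models as qwen2_5_14b
    from .opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
        models as qwen2_5_32b
    from .opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
        models as yi_1_5_9b

models = [*qwen2_5_14b, *qwen2_5_32b, *yi_1_5_9b]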
@@ -526,7 +526,7 @@ class OpenAISDK(OpenAI):

     def _generate(self, input: PromptList | str, max_out_len: int,
                   temperature: float) -> str:
-        from openai import BadRequestError
+        from openai import APIStatusError, BadRequestError
         assert isinstance(input, (str, PromptList))

         # max num token for gpt-3.5-turbo is 4097
@@ -616,7 +616,7 @@ class OpenAISDK(OpenAI):
                                 from the API provider.')
             return responses.choices[0].message.content

-        except BadRequestError as e:
+        except (BadRequestError, APIStatusError) as e:
             # Handle BadRequest status
             # You can specify self.status_code_mappings to bypass \
             # API sensitivity blocks
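The point of widening the except clause is that both exception types carry an HTTP status code, so one handler can translate configured status codes into placeholder predictions instead of failing the whole evaluation. A hedged illustration of that pattern; the function, model name, and mapping below are illustrative, not the actual OpenAISDK implementation:

# Sketch of the status-code-mapping error handling this change enables.
from openai import OpenAI, APIStatusError, BadRequestError

status_code_mappings = {400: '', 429: 'RATE_LIMITED'}  # example mapping

def safe_chat(client: OpenAI, prompt: str) -> str:
    try:
        rsp = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[{'role': 'user', 'content': prompt}],
        )
        return rsp.choices[0].message.content
    except (BadRequestError, APIStatusError) as e:
        # Both exception types expose .status_code in the openai v1 SDK.
        if e.status_code in status_code_mappings:
            return status_code_mappings[e.status_code]
        raise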
@@ -87,6 +87,7 @@ class TurboMindModelwithChatTemplate(BaseModel):
     def generate(self,
                  inputs: List[str],
                  max_out_len: int,
+                 min_out_len: Optional[int] = None,
                  stopping_criteria: List[str] = [],
                  do_sample: Optional[bool] = None,
                  temperature: float = 1.0,
@@ -123,6 +124,10 @@ class TurboMindModelwithChatTemplate(BaseModel):

         gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
         gen_config.update(self.gen_config)
+        if max_out_len is not None:
+            gen_config['max_new_tokens'] = max_out_len
+        if min_out_len is not None:
+            gen_config['min_new_tokens'] = min_out_len
         if do_sample or ('do_sample' in self.gen_config and self.gen_config['do_sample']):
             gen_config['top_k'] = 40
             gen_config['temperature'] = temperature
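The added lines establish a clear precedence for generation settings: engine defaults, then the user-supplied gen_config, then the per-call output-length bounds. A minimal self-contained sketch of that layering (DEFAULT_GEN_CONFIG and its values are illustrative):

# Sketch: gen_config precedence as implemented by the new lines above.
import copy
from typing import Optional

DEFAULT_GEN_CONFIG = {'max_new_tokens': 512, 'top_k': 1}  # illustrative defaults

def build_gen_config(user_gen_config: dict,
                     max_out_len: Optional[int],
                     min_out_len: Optional[int]) -> dict:
    gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
    gen_config.update(user_gen_config)               # user config beats defaults
    if max_out_len is not None:
        gen_config['max_new_tokens'] = max_out_len   # per-call cap wins
    if min_out_len is not None:
        gen_config['min_new_tokens'] = min_out_len   # new in this commit
    return gen_config

print(build_gen_config({'top_k': 40}, max_out_len=1024, min_out_len=1))
# -> {'max_new_tokens': 1024, 'top_k': 40, 'min_new_tokens': 1}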