diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_8815eb.py
index 226746c0..b4c89813 100644
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_8815eb.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_8815eb.py
@@ -40,6 +40,6 @@ bigcodebench_full_instruct_datasets = [
         infer_cfg=bigcodebench_full_infer_cfg,
         eval_cfg=bigcodebench_full_eval_cfg,
         release_version='v0.1.2',
-        n=3,
-        k=2)
+        n=5,
+        k=3)
 ]
diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
index b3804003..5baa55c0 100644
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
@@ -42,7 +42,7 @@ bigcodebench_hard_instruct_datasets = [
         eval_cfg=bigcodebench_hard_eval_cfg,
         release_version='v0.1.2',
         dataset_version='hard',
-        n=3,
-        k=2
+        n=5,
+        k=3
     )
 ]
diff --git a/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py
new file mode 100644
index 00000000..45542977
--- /dev/null
+++ b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py
@@ -0,0 +1,37 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg,
+        n=3,
+        k=2)
+]
diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py
index 9788b638..af001716 100644
--- a/opencompass/datasets/humaneval.py
+++ b/opencompass/datasets/humaneval.py
@@ -183,13 +183,13 @@ def humaneval_postprocess_v2(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[0]
-    return text
+    return text.lstrip()
 
 def humaneval_postprocess_v3(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[-1]
-    return text
+    return text.lstrip()
 
 def humaneval_internal_v2_postprocess(text: str):
     if text.startswith(' ') and not text.startswith('  '):
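The `n`/`k` fields in these repeat-gen configs drive repeated sampling: `n` completions are generated per problem and scored with a pass@k-style metric. To see why bumping `n=3, k=2` to `n=5, k=3` matters, here is a minimal sketch of the standard unbiased pass@k estimator from the HumanEval paper (the metric implementation itself is not part of this diff, so treat this as an illustration, not OpenCompass's exact code):

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: probability that at least one of k completions
    drawn (without replacement) from n samples, c of which pass, is correct.
    Computed as 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:
        # Fewer than k failing samples: every k-subset contains a pass.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# With the new config values (n=5, k=3), a problem where 2 of 5
# samples pass scores: 1 - C(3,3)/C(5,3) = 1 - 1/10 = 0.9
print(pass_at_k(5, 2, 3))  # 0.9
```

Larger `n` relative to `k` lowers the variance of this estimate, which is the usual motivation for raising the sample count.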