From bd7b705be46dadbce0cce068d76e67f295599b43 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Wed, 11 Dec 2024 18:20:29 +0800
Subject: [PATCH] [Update] Update dataset configuration with no max_out_len
 (#1754)

---
 .../wildbench/wildbench_pair_judge.py         |  2 +-
 .../wildbench/wildbench_pair_judge_new.py     |  2 +-
 .../datasets/IFEval/IFEval_gen_353ae7.py      | 33 +++++++++++++++
 ...umaneval_openai_sample_evals_gen_dcae0e.py | 36 ++++++++++++++++
 .../humanevalx/humanevalx_gen_3d84a3.py       | 41 +++++++++++++++++++
 .../datasets/math/math_0shot_gen_11c4b5.py    | 35 ++++++++++++++++
 .../wildbench/wildbench_pair_judge.py         |  2 +-
 .../wildbench/wildbench_pair_judge_new.py     |  2 +-
 8 files changed, 149 insertions(+), 4 deletions(-)
 create mode 100644 opencompass/configs/datasets/IFEval/IFEval_gen_353ae7.py
 create mode 100644 opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_dcae0e.py
 create mode 100644 opencompass/configs/datasets/humanevalx/humanevalx_gen_3d84a3.py
 create mode 100644 opencompass/configs/datasets/math/math_0shot_gen_11c4b5.py

diff --git a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
index b8cbd02f..d1adca1f 100644
--- a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
+++ b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, infer_mode='last'),
 )
 
 subjective_eval_cfg = dict(
diff --git a/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py b/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
index eff6cbb0..15310554 100644
--- a/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
+++ b/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, infer_mode='last'),
 )
 
 subjective_eval_cfg = dict(
diff --git a/opencompass/configs/datasets/IFEval/IFEval_gen_353ae7.py b/opencompass/configs/datasets/IFEval/IFEval_gen_353ae7.py
new file mode 100644
index 00000000..0a83521f
--- /dev/null
+++ b/opencompass/configs/datasets/IFEval/IFEval_gen_353ae7.py
@@ -0,0 +1,33 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import IFEvalDataset, IFEvaluator
+
+ifeval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='reference')
+
+ifeval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+ifeval_eval_cfg = dict(
+    evaluator=dict(type=IFEvaluator),
+    pred_role='BOT',
+)
+
+ifeval_datasets = [
+    dict(
+        abbr='IFEval',
+        type=IFEvalDataset,
+        path='data/ifeval/input_data.jsonl',
+        reader_cfg=ifeval_reader_cfg,
+        infer_cfg=ifeval_infer_cfg,
+        eval_cfg=ifeval_eval_cfg)
+]
diff --git a/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_dcae0e.py b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_dcae0e.py
new file mode 100644
index 00000000..5d5bed64
--- /dev/null
+++ b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_dcae0e.py
@@ -0,0 +1,36 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # this parameter applies only to humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
diff --git a/opencompass/configs/datasets/humanevalx/humanevalx_gen_3d84a3.py b/opencompass/configs/datasets/humanevalx/humanevalx_gen_3d84a3.py
new file mode 100644
index 00000000..b5f48820
--- /dev/null
+++ b/opencompass/configs/datasets/humanevalx/humanevalx_gen_3d84a3.py
@@ -0,0 +1,41 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
+
+humanevalx_reader_cfg = dict(
+    input_columns=['prompt'], output_column='declaration', train_split='test')
+
+humanevalx_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalx_eval_cfg_dict = {
+    lang: dict(
+        evaluator=dict(
+            type=HumanevalXEvaluator,
+            language=lang,
+            ip_address=
+            'localhost',  # replace with your code_eval_server IP address and port
+            port=5001),  # see https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
+        pred_role='BOT')
+    for lang in ['python', 'cpp', 'go', 'java', 'js']  # rust is not supported yet
+}
+
+# Please download the needed `xx.jsonl.gz` files from
+# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
+# and move them into the `data/humanevalx/` folder
+humanevalx_datasets = [
+    dict(
+        type=HumanevalXDataset,
+        abbr=f'humanevalx-{lang}',
+        language=lang,
+        path='./data/humanevalx',
+        reader_cfg=humanevalx_reader_cfg,
+        infer_cfg=humanevalx_infer_cfg,
+        eval_cfg=humanevalx_eval_cfg_dict[lang])
+    for lang in ['python', 'cpp', 'go', 'java', 'js']
+]
diff --git a/opencompass/configs/datasets/math/math_0shot_gen_11c4b5.py b/opencompass/configs/datasets/math/math_0shot_gen_11c4b5.py
new file mode 100644
index 00000000..71597c29
--- /dev/null
+++ b/opencompass/configs/datasets/math/math_0shot_gen_11c4b5.py
@@ -0,0 +1,35 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# postprocess v2
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
+)
+
+math_datasets = [
+    dict(
+        type=MATHDataset,
+        abbr='math',
+        path='opencompass/math',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
diff --git a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
index b8cbd02f..d1adca1f 100644
--- a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
+++ b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py
@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, infer_mode='last'),
 )
 
 subjective_eval_cfg = dict(
diff --git a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
index eff6cbb0..15310554 100644
--- a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
+++ b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge_new.py
@@ -20,7 +20,7 @@ subjective_infer_cfg = dict(
         template="""{dialogue}"""
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=ChatInferencer, max_seq_len=32768, max_out_len=4096, infer_mode='last'),
+    inferencer=dict(type=ChatInferencer, max_seq_len=32768, infer_mode='last'),
 )
 
 subjective_eval_cfg = dict(
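
With max_out_len gone from the dataset configs above, the generation budget
comes from the model config (or the command line) instead of being pinned per
dataset. Below is a minimal sketch of a model entry that supplies it; the abbr
and path are placeholders, and HuggingFacewithChatTemplate is assumed as the
chat-model wrapper:

    from opencompass.models import HuggingFacewithChatTemplate

    models = [
        dict(
            type=HuggingFacewithChatTemplate,
            abbr='example-7b-chat-hf',    # placeholder model alias
            path='org/example-7b-chat',   # placeholder HuggingFace path
            max_out_len=4096,             # output budget set here, not per dataset
            batch_size=8,
            run_cfg=dict(num_gpus=1),
        )
    ]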