diff --git a/configs/datasets/humaneval/deprecated_humaneval_gen_8e312c.py b/configs/datasets/humaneval/deprecated_humaneval_gen_8e312c.py new file mode 100644 index 00000000..2b9e20ff --- /dev/null +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_8e312c.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/configs/datasets/humaneval/humaneval_gen_8e312c.py b/configs/datasets/humaneval/humaneval_gen_8e312c.py index 2b9e20ff..8a0c6c7c 100644 --- a/configs/datasets/humaneval/humaneval_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -22,7 +22,7 @@ humaneval_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type=humaneval_postprocess), + pred_postprocessor=dict(type=humaneval_postprocess_v2), ) humaneval_datasets = [ diff --git a/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py b/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py deleted file mode 120000 index ca9488ed..00000000 --- a/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py +++ /dev/null @@ -1 +0,0 @@ -./humaneval_gen_8e312c.py \ No newline at end of file diff --git a/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py b/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py new file mode 100644 index 00000000..d1696511 --- /dev/null +++ b/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval_passk', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py b/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py index a1940b3f..5eff32c2 100644 --- a/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -22,12 +22,12 @@ humaneval_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type=humaneval_postprocess), + pred_postprocessor=dict(type=humaneval_postprocess_v2), ) humaneval_datasets = [ dict( - abbr='openai_humaneval_pass10', + abbr='openai_humaneval_repeat10', type=HumanevalDataset, path='./data/humaneval/human-eval-v2-20210705.jsonl', num_repeats=10, diff --git a/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py deleted file mode 120000 index 4a0cf6f3..00000000 --- a/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py +++ /dev/null @@ -1 +0,0 @@ -./humaneval_cn_gen_6313aa.py \ No newline at end of file diff --git a/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py new file mode 100644 index 00000000..4073824f --- /dev/null +++ b/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='完成以下Python代码任务:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_cn_datasets = [ + dict( + abbr='openai_humaneval_cn_passk', + type=HumanevalDataset, + path='./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) + +] diff --git a/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py index 5665695b..3cafadc3 100644 --- a/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py +++ b/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py @@ -27,7 +27,7 @@ humaneval_eval_cfg = dict( humaneval_cn_datasets = [ dict( - abbr='openai_humaneval_cn_pass10', + abbr='openai_humaneval_cn_repeat10', type=HumanevalDataset, path='./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl', num_repeats=10, diff --git a/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py index 63698cba..f1f61387 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py @@ -19,7 +19,7 @@ humaneval_plus_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_plus_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator,k=1, metric='EvalPlus'), + evaluator=dict(type=HumanEvaluator, metric='EvalPlus'), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py new file mode 100644 index 00000000..1ff4c4dd --- /dev/null +++ b/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator, metric='EvalPlus'), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus_passk', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg) +] diff --git a/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py new file mode 100644 index 00000000..bb859a7a --- /dev/null +++ b/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator, metric='EvalPlus'), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus_repeat10', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + num_repeats=10, + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg) +] diff --git a/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py b/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py index edc4b9ae..1d3f6611 100644 --- a/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py +++ b/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py @@ -56,7 +56,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset_V2, - abbr='mbpp', + abbr='mbpp_passk', path='./data/mbpp/mbpp.jsonl', reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, diff --git a/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py b/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py index 1b8a6a86..53fad641 100644 --- a/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py +++ b/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py @@ -58,7 +58,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset_V2, - abbr='mbpp_pass10', + abbr='mbpp_repeat10', path='./data/mbpp/mbpp.jsonl', num_repeats=10, reader_cfg=mbpp_reader_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py index 7514c531..fc3a430a 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py +++ b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py @@ -56,7 +56,7 @@ sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_rol sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, - abbr='sanitized_mbpp', + abbr='sanitized_mbpp_passk', path='./sanitized-mbpp.jsonl', reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, diff --git a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py index 6af2d3f6..90e64c15 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py +++ b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py @@ -56,7 +56,7 @@ sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_rol sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, - abbr='sanitized_mbpp_pass10', + abbr='sanitized_mbpp_repeat10', path='./sanitized-mbpp.jsonl', num_repeats=10, reader_cfg=sanitized_mbpp_reader_cfg, diff --git a/configs/datasets/mbpp_cn/mbpp_cn_passk_gen_1d1481.py b/configs/datasets/mbpp_cn/mbpp_cn_passk_gen_1d1481.py index e80954b1..8ce5b7c1 100644 --- a/configs/datasets/mbpp_cn/mbpp_cn_passk_gen_1d1481.py +++ b/configs/datasets/mbpp_cn/mbpp_cn_passk_gen_1d1481.py @@ -56,7 +56,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") mbpp_cn_datasets = [ dict( type=MBPPDataset_V2, - abbr='mbpp_cn', + abbr='mbpp_cn_passk', path='./data/mbpp_cn/mbpp_cn.jsonl', reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, diff --git a/configs/datasets/mbpp_cn/mbpp_cn_repeat10_gen_1d1481.py b/configs/datasets/mbpp_cn/mbpp_cn_repeat10_gen_1d1481.py index 3f0a6258..9ac1fc59 100644 --- a/configs/datasets/mbpp_cn/mbpp_cn_repeat10_gen_1d1481.py +++ b/configs/datasets/mbpp_cn/mbpp_cn_repeat10_gen_1d1481.py @@ -56,7 +56,7 @@ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") mbpp_cn_datasets = [ dict( type=MBPPDataset_V2, - abbr='mbpp_cn_pass10', + abbr='mbpp_cn_repeat10', path='./data/mbpp_cn/mbpp_cn.jsonl', num_repeats=10, reader_cfg=mbpp_reader_cfg, diff --git a/opencompass/models/huggingface.py b/opencompass/models/huggingface.py index d1f98864..0f6864ee 100644 --- a/opencompass/models/huggingface.py +++ b/opencompass/models/huggingface.py @@ -621,6 +621,7 @@ class HuggingFaceChatGLM3(HuggingFace): peft_path: Optional[str] = None, tokenizer_only: bool = False, model_kwargs: dict = dict(device_map='auto'), + generation_kwargs: dict = dict(), meta_template: Optional[Dict] = None, extract_pred_after_decode: bool = False, batch_padding: bool = False, @@ -634,6 +635,7 @@ class HuggingFaceChatGLM3(HuggingFace): tokenizer_kwargs=tokenizer_kwargs, peft_path=peft_path, tokenizer_only=tokenizer_only, + generation_kwargs=generation_kwargs, model_kwargs=model_kwargs, meta_template=meta_template, extract_pred_after_decode=extract_pred_after_decode, @@ -647,15 +649,17 @@ class HuggingFaceChatGLM3(HuggingFace): def generate(self, inputs: List[str or PromptList], max_out_len: int = 512, - temperature: float = 0.6, - skip_overlength=False) -> str: + skip_overlength=False, + **kwargs) -> str: """Generate response from input prompt. Args: inputs (list): input prompt max_out_len (int): max output length - temperature (float): temperature for sampling """ + generation_kwargs = kwargs.copy() + generation_kwargs.update(self.generation_kwargs) + responses = [] for _input in inputs: assert isinstance(_input, (str, PromptList)) @@ -692,7 +696,8 @@ class HuggingFaceChatGLM3(HuggingFace): try: response, history = self.model.chat(self.tokenizer, user_content, - history=history) + history=history, + **generation_kwargs) # response will be dict sometime if isinstance(response, dict): response = response.get('content', '')