diff --git a/opencompass/configs/datasets/xhumaneval/README.md b/opencompass/configs/datasets/xhumaneval/README.md new file mode 100644 index 00000000..ce004785 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/README.md @@ -0,0 +1,69 @@ +# HumanEval + +```bash +python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug +python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug +``` + +## Base Models + +| model | pass@1 | +|:------------------------:|---------:| +| llama-7b-turbomind | 12.80 | +| llama-13b-turbomind | 15.24 | +| llama-30b-turbomind | 9.15 | +| llama-65b-turbomind | 7.32 | +| llama-2-7b-turbomind | 14.02 | +| llama-2-13b-turbomind | 15.24 | +| llama-2-70b-turbomind | 15.24 | +| llama-3-8b-turbomind | 28.05 | +| llama-3-70b-turbomind | 28.05 | +| internlm2-1.8b-turbomind | 30.49 | +| internlm2-7b-turbomind | 48.17 | +| internlm2-20b-turbomind | 51.83 | +| qwen-1.8b-turbomind | 16.46 | +| qwen-7b-turbomind | 23.78 | +| qwen-14b-turbomind | 23.78 | +| qwen-72b-turbomind | 66.46 | +| qwen1.5-0.5b-hf | 8.54 | +| qwen1.5-1.8b-hf | 23.17 | +| qwen1.5-4b-hf | 41.46 | +| qwen1.5-7b-hf | 53.05 | +| qwen1.5-14b-hf | 57.32 | +| qwen1.5-32b-hf | 70.12 | +| qwen1.5-72b-hf | 65.85 | +| qwen1.5-moe-a2-7b-hf | 45.73 | +| mistral-7b-v0.1-hf | 14.02 | +| mistral-7b-v0.2-hf | 9.15 | +| mixtral-8x7b-v0.1-hf | 24.39 | +| mixtral-8x22b-v0.1-hf | 16.46 | +| yi-6b-hf | 14.63 | +| yi-34b-hf | 17.07 | +| deepseek-7b-base-hf | 18.29 | +| deepseek-67b-base-hf | 23.17 | + +## Chat Models + +| model | pass@1 | +|:-----------------------------:|---------:| +| qwen1.5-0.5b-chat-hf | 9.15 | +| qwen1.5-1.8b-chat-hf | 15.85 | +| qwen1.5-4b-chat-hf | 30.49 | +| qwen1.5-7b-chat-hf | 40.85 | +| qwen1.5-14b-chat-hf | 50.00 | +| qwen1.5-32b-chat-hf | 57.93 | +| qwen1.5-72b-chat-hf | 60.37 | +| qwen1.5-110b-chat-hf | 65.24 | +| internlm2-chat-1.8b-hf | 33.54 | +| internlm2-chat-1.8b-sft-hf | 34.15 | +| internlm2-chat-7b-hf | 56.71 | +| internlm2-chat-7b-sft-hf | 61.59 | +| internlm2-chat-20b-hf | 67.68 | +| internlm2-chat-20b-sft-hf | 67.68 | +| llama-3-8b-instruct-hf | 55.49 | +| llama-3-70b-instruct-hf | 70.73 | +| llama-3-8b-instruct-lmdeploy | 57.93 | +| llama-3-70b-instruct-lmdeploy | 70.73 | +| mistral-7b-instruct-v0.1-hf | 32.32 | +| mistral-7b-instruct-v0.2-hf | 29.27 | +| mixtral-8x7b-instruct-v0.1-hf | 34.15 | diff --git a/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_4a6eef.py b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_4a6eef.py new file mode 100644 index 00000000..67dd58a5 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_4a6eef.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_6d1cc2.py b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_6d1cc2.py new file mode 100644 index 00000000..830d391f --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_6d1cc2.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_a82cae.py b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_a82cae.py new file mode 100644 index 00000000..dc5d10f5 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_a82cae.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + 
type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_d2537e.py b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_d2537e.py new file mode 100644 index 00000000..69231fdc --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_d2537e.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Complete the following python code:\n{prompt}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_fd5822.py b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_fd5822.py new file mode 100644 index 00000000..ea56afd6 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_fd5822.py @@ -0,0 +1,31 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_ff7054.py b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_ff7054.py new file mode 100644 index 00000000..a1be3ba9 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/deprecated_humaneval_gen_ff7054.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = 
dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='Complete the following python code:'), + ], + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/README.md b/opencompass/configs/datasets/xhumaneval/humaneval/README.md new file mode 100644 index 00000000..ce004785 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/README.md @@ -0,0 +1,69 @@ +# HumanEval + +```bash +python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug +python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug +``` + +## Base Models + +| model | pass@1 | +|:------------------------:|---------:| +| llama-7b-turbomind | 12.80 | +| llama-13b-turbomind | 15.24 | +| llama-30b-turbomind | 9.15 | +| llama-65b-turbomind | 7.32 | +| llama-2-7b-turbomind | 14.02 | +| llama-2-13b-turbomind | 15.24 | +| llama-2-70b-turbomind | 15.24 | +| llama-3-8b-turbomind | 28.05 | +| llama-3-70b-turbomind | 28.05 | +| internlm2-1.8b-turbomind | 30.49 | +| internlm2-7b-turbomind | 48.17 | +| internlm2-20b-turbomind | 51.83 | +| qwen-1.8b-turbomind | 16.46 | +| qwen-7b-turbomind | 23.78 | +| qwen-14b-turbomind | 23.78 | +| qwen-72b-turbomind | 66.46 | +| qwen1.5-0.5b-hf | 8.54 | +| qwen1.5-1.8b-hf | 23.17 | +| qwen1.5-4b-hf | 41.46 | +| qwen1.5-7b-hf | 53.05 | +| qwen1.5-14b-hf | 57.32 | +| qwen1.5-32b-hf | 70.12 | +| qwen1.5-72b-hf | 65.85 | +| qwen1.5-moe-a2-7b-hf | 45.73 | +| mistral-7b-v0.1-hf | 14.02 | +| mistral-7b-v0.2-hf | 9.15 | +| mixtral-8x7b-v0.1-hf | 24.39 | +| mixtral-8x22b-v0.1-hf | 16.46 | +| yi-6b-hf | 14.63 | +| yi-34b-hf | 17.07 | +| deepseek-7b-base-hf | 18.29 | +| deepseek-67b-base-hf | 23.17 | + +## Chat Models + +| model | pass@1 | +|:-----------------------------:|---------:| +| qwen1.5-0.5b-chat-hf | 9.15 | +| qwen1.5-1.8b-chat-hf | 15.85 | +| qwen1.5-4b-chat-hf | 30.49 | +| qwen1.5-7b-chat-hf | 40.85 | +| qwen1.5-14b-chat-hf | 50.00 | +| qwen1.5-32b-chat-hf | 57.93 | +| qwen1.5-72b-chat-hf | 60.37 | +| qwen1.5-110b-chat-hf | 65.24 | +| internlm2-chat-1.8b-hf | 33.54 | +| internlm2-chat-1.8b-sft-hf | 34.15 | +| internlm2-chat-7b-hf | 56.71 | +| internlm2-chat-7b-sft-hf | 61.59 | +| internlm2-chat-20b-hf | 67.68 | +| internlm2-chat-20b-sft-hf | 67.68 | +| llama-3-8b-instruct-hf | 55.49 | +| llama-3-70b-instruct-hf | 70.73 | +| llama-3-8b-instruct-lmdeploy | 57.93 | +| llama-3-70b-instruct-lmdeploy | 70.73 | +| mistral-7b-instruct-v0.1-hf | 32.32 | +| mistral-7b-instruct-v0.2-hf | 29.27 | +| mixtral-8x7b-instruct-v0.1-hf | 34.15 | diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_4a6eef.py b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_4a6eef.py new file mode 100644 index 00000000..67dd58a5 --- 
/dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_4a6eef.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_6d1cc2.py b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_6d1cc2.py new file mode 100644 index 00000000..830d391f --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_6d1cc2.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_a82cae.py b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_a82cae.py new file mode 100644 index 00000000..dc5d10f5 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_a82cae.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_d2537e.py b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_d2537e.py new file mode 100644 index 00000000..69231fdc --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_d2537e.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Complete the following python code:\n{prompt}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + 
eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_fd5822.py b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_fd5822.py new file mode 100644 index 00000000..ea56afd6 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_fd5822.py @@ -0,0 +1,31 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_ff7054.py b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_ff7054.py new file mode 100644 index 00000000..a1be3ba9 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/deprecated_humaneval_gen_ff7054.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='Complete the following python code:'), + ], + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen.py b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen.py new file mode 100644 index 00000000..74019908 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .humaneval_gen_8e312c import humaneval_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen_66a7f4.py b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen_66a7f4.py new 
file mode 100644 index 00000000..b4109925 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen_66a7f4.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +HUMANEVAL_TEMPLATE = dict( + round=[ + dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'), + ] +) + +humaneval_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + k=[1, 10, 100], + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen_8e312c.py new file mode 100644 index 00000000..a8c6e587 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_gen_8e312c.py @@ -0,0 +1,37 @@ +# THIS SHALL ALSO BE DEPRECATED +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_openai_sample_evals_gen_159614.py b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_openai_sample_evals_gen_159614.py new file mode 100644 index 00000000..d364f938 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_openai_sample_evals_gen_159614.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( 
+ input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_passk_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_passk_gen_8e312c.py new file mode 100644 index 00000000..6224696f --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_passk_gen_8e312c.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval_passk', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_repeat10_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_repeat10_gen_8e312c.py new file mode 100644 index 00000000..adcabde9 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/humaneval_repeat10_gen_8e312c.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + 
evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval_repeat10', + type=HumanevalDataset, + path='opencompass/humaneval', + num_repeats=10, + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/internal_humaneval_gen_ce6b06.py b/opencompass/configs/datasets/xhumaneval/humaneval/internal_humaneval_gen_ce6b06.py new file mode 100644 index 00000000..53505e52 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/internal_humaneval_gen_ce6b06.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v2_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='# Complete the following python code:\n{prompt}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_internal_v2_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval/internal_humaneval_gen_d2537e.py b/opencompass/configs/datasets/xhumaneval/humaneval/internal_humaneval_gen_d2537e.py new file mode 100644 index 00000000..cb8e6223 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval/internal_humaneval_gen_d2537e.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v1_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Complete the following python code:\n{prompt}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_internal_v1_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval_gen_66a7f4.py b/opencompass/configs/datasets/xhumaneval/humaneval_gen_66a7f4.py new file mode 100644 index 00000000..b4109925 --- /dev/null +++ 
b/opencompass/configs/datasets/xhumaneval/humaneval_gen_66a7f4.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +HUMANEVAL_TEMPLATE = dict( + round=[ + dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'), + ] +) + +humaneval_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + k=[1, 10, 100], + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval/humaneval_gen_8e312c.py new file mode 100644 index 00000000..ed76790c --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval_gen_8e312c.py @@ -0,0 +1,49 @@ +# THIS SHALL ALSO BE DEPRECATED +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import xHumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2, HumanEvalPlusEvaluator + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +# humaneval_datasets = [ +# dict( +# abbr='openai_humaneval', +# type=xHumanevalDataset, +# path='opencompass/humaneval', +# reader_cfg=humaneval_reader_cfg, +# infer_cfg=humaneval_infer_cfg, +# eval_cfg=humaneval_eval_cfg) +# ] +LANGS = ['ar'] +humaneval_datasets = [] +for lang in LANGS: + humaneval_datasets.append( + dict( + abbr=f'humaneval_{lang}', + type=xHumanevalDataset, + path=f'data/xhumaneval_plus/humaneval_plus_gpt4o_{lang}.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) + ) diff --git a/opencompass/configs/datasets/xhumaneval/humaneval_openai_sample_evals_gen_159614.py b/opencompass/configs/datasets/xhumaneval/humaneval_openai_sample_evals_gen_159614.py new file mode 100644 index 00000000..d364f938 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval_openai_sample_evals_gen_159614.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from 
opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval_passk_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval/humaneval_passk_gen_8e312c.py new file mode 100644 index 00000000..6224696f --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval_passk_gen_8e312c.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval_passk', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/humaneval_repeat10_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval/humaneval_repeat10_gen_8e312c.py new file mode 100644 index 00000000..adcabde9 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/humaneval_repeat10_gen_8e312c.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python 
code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval_repeat10', + type=HumanevalDataset, + path='opencompass/humaneval', + num_repeats=10, + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/internal_humaneval_gen_ce6b06.py b/opencompass/configs/datasets/xhumaneval/internal_humaneval_gen_ce6b06.py new file mode 100644 index 00000000..53505e52 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/internal_humaneval_gen_ce6b06.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v2_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='# Complete the following python code:\n{prompt}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_internal_v2_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/internal_humaneval_gen_d2537e.py b/opencompass/configs/datasets/xhumaneval/internal_humaneval_gen_d2537e.py new file mode 100644 index 00000000..cb8e6223 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/internal_humaneval_gen_d2537e.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v1_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Complete the following python code:\n{prompt}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_internal_v1_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval/xhumaneval_gen.py 
b/opencompass/configs/datasets/xhumaneval/xhumaneval_gen.py new file mode 100644 index 00000000..74019908 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval/xhumaneval_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .humaneval_gen_8e312c import humaneval_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen.py b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen.py new file mode 100644 index 00000000..9176bdee --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .humaneval_plus_gen_8e312c import humaneval_plus_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen_66a7f4.py b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen_66a7f4.py new file mode 100644 index 00000000..357ef91c --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen_66a7f4.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +HUMANEVAL_TEMPLATE = dict( + round=[ + dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'), + ] +) + +humaneval_plus_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + k=[1, 10, 100], + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen_8e312c.py new file mode 100644 index 00000000..b37bf37f --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_gen_8e312c.py @@ -0,0 +1,57 @@ +# THIS SHALL ALSO BE DEPRECATED +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import xHumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048)) + +# A single-language (Russian) config, kept here for reference: +# humaneval_plus_datasets = [ +# dict( +# abbr='xhumaneval_plus_ru', +#
type=xHumanevalDataset, +# path='xhumaneval_plus', +# name='humaneval_plus_gpt4o_ru.jsonl', +# local_mode=True, +# reader_cfg=humaneval_plus_reader_cfg, +# infer_cfg=humaneval_plus_infer_cfg, +# eval_cfg=humaneval_plus_eval_cfg) +# ] + +LANGS = ['de'] + +humaneval_plus_datasets = [] +for lang in LANGS: + humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + pred_role='BOT', + k=[1, 10], # the parameter only for humaneval + lang=lang, + pred_postprocessor=dict(type=humaneval_postprocess_v2), + ) + humaneval_plus_datasets.append( + dict( + abbr=f'xhumaneval_plus_{lang}', + type=xHumanevalDataset, + path=f'data/xhumaneval_plus/humaneval_{lang}.jsonl', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg) + ) diff --git a/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py new file mode 100644 index 00000000..8e91abdc --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py @@ -0,0 +1,38 @@ +# THIS SHALL ALSO BE DEPRECATED +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Read the following function signature and docstring, and fully implement the function described. 
Your response should only contain the code for this function.\n{prompt}' + ), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_passk_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_passk_gen_8e312c.py new file mode 100644 index 00000000..d602d73b --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_passk_gen_8e312c.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus_passk', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_repeat10_gen_8e312c.py b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_repeat10_gen_8e312c.py new file mode 100644 index 00000000..bbbafd90 --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval_plus/humaneval_plus_repeat10_gen_8e312c.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus_repeat10', + 
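+        # num_repeats=10 below duplicates every task so that pass@k (k <= 10) can be
+        # estimated from independent generations when the backend cannot return
+        # multiple sequences per prompt.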
type=HumanevalDataset, + path='opencompass/humaneval', + num_repeats=10, + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg) +] diff --git a/opencompass/configs/datasets/xhumaneval_plus/xhumaneval_plus_gen.py b/opencompass/configs/datasets/xhumaneval_plus/xhumaneval_plus_gen.py new file mode 100644 index 00000000..9176bdee --- /dev/null +++ b/opencompass/configs/datasets/xhumaneval_plus/xhumaneval_plus_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .humaneval_plus_gen_8e312c import humaneval_plus_datasets # noqa: F401, F403 diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index c13fc983..911f4293 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -146,6 +146,7 @@ from .xcopa import * # noqa: F401, F403 from .xiezhi import XiezhiDataset, XiezhiRetriever # noqa: F401, F403 from .xlsum import * # noqa: F401, F403 from .xsum import * # noqa: F401, F403 +from .xhumaneval import xHumanevalDataset # noqa: F401, F403 from .xIFEval.ifeval import xIFEvalDataset, xIFEvaluator # noqa: F401, F403 from .xlivecodebench import xLCBCodeGenerationDataset, xLCBCodeGenerationEvaluator from .xmgsm import * # noqa: F401, F403 diff --git a/opencompass/datasets/xhumaneval.py b/opencompass/datasets/xhumaneval.py new file mode 100644 index 00000000..7bc8588b --- /dev/null +++ b/opencompass/datasets/xhumaneval.py @@ -0,0 +1,73 @@ +# flake8: noqa: E501 +# yapf: disable +import json +from os import environ + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + +HUMANEVAL_IMPORT_ERROR = '''\ +Please install human_eval using the following steps: +git clone git@github.com:open-compass/human-eval.git +cd human-eval && pip install -e .''' + +HUMANEVAL_PLUS_IMPORT_ERROR = '''\ +Please install evalplus using the following steps: +git clone --recurse-submodules git@github.com:open-compass/human-eval.git +cd human-eval +pip install -e . +pip install -e evalplus''' + + +@LOAD_DATASET.register_module() +class xHumanevalDataset(BaseDataset): + + @staticmethod + def load(path: str, num_repeats: int = 1, local_mode: bool = False): + """Load a HumanEval-style dataset in pass@k mode. + + Note that you can use num_repeats > 1 when your model does not support + `num_return_sequence` in generation; otherwise, use the raw + humaneval dataset and set `num_return_sequence` in the model config to + generate multiple responses for testing pass@k > 1. + + It is better to change your dataset abbr accordingly if you set + num_repeats > 1, otherwise the number in + `.cache/dataset_size.json` might be inconsistent. + + Args: + path(str): Dataset name or path to a local jsonl file. + num_repeats(int): Number of repetitions of each sample, used to + obtain multiple responses in special cases. + local_mode(bool): Whether to resolve the path locally.
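+ + A minimal usage sketch (the jsonl path is illustrative, mirroring the + files referenced by the configs in this PR): + + >>> ds = xHumanevalDataset.load( + ...     'data/xhumaneval_plus/humaneval_plus_gpt4o_ar.jsonl', + ...     num_repeats=10) + >>> # each task now appears 10 times, so pass@k (k <= 10) can be + >>> # estimated from independent generations.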
+        """ + # NOTE: `local_mode` is currently ignored; the path is always + # resolved locally. + path = get_data_path(path, local_mode=True) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, subset_name='openai_humaneval', split='test') + dataset_list = [] + for example in dataset: + dataset_list.extend([example] * num_repeats) + dataset = Dataset.from_list(dataset_list) + else: + dataset = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + item = json.loads(line.strip()) + # keep only the fields needed for inference and evaluation + item = {'prompt': item['prompt'], 'task_id': item['task_id'], 'test': item['test'], 'entry_point': item['entry_point'], 'canonical_solution': item['canonical_solution']} + dataset.extend( + [item for _ in range(num_repeats)]) + dataset = Dataset.from_list(dataset) + return dataset + + diff --git a/run.sh b/run.sh new file mode 100644 index 00000000..8a13ab2b --- /dev/null +++ b/run.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python3 run.py --models hf_llama3_1_8b_instruct --datasets xhumaneval_plus_gen -a vllm --max-num-worker 1 \ No newline at end of file diff --git a/start.sh b/start.sh new file mode 100644 index 00000000..028f06f2 --- /dev/null +++ b/start.sh @@ -0,0 +1,6 @@ +cd ../ +git lfs install +git clone https://huggingface.co/datasets/LLaMAX/BenchMAX_Function_Completion +mkdir -p ~/.cache/opencompass/data/xhumaneval_plus +mv BenchMAX_Function_Completion/* ~/.cache/opencompass/data/xhumaneval_plus/ +cd opencompass