mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

add humaneval

This commit is contained in:
parent c6c4ffc180
commit 8ea13bde6a
69  opencompass/configs/datasets/xhumaneval/README.md  Normal file
@@ -0,0 +1,69 @@
# HumanEval

```bash
python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```

## Base Models

| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |

## Chat Models

| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,31 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='Complete the following python code:'),
            ],
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
69  opencompass/configs/datasets/xhumaneval/humaneval/README.md  Normal file
@@ -0,0 +1,69 @@
# HumanEval

```bash
python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```

## Base Models

| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |

## Chat Models

| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,31 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='Complete the following python code:'),
            ],
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
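This thin wrapper only re-exports `humaneval_datasets` from the pinned `humaneval_gen_8e312c` config. A top-level evaluation config composes such fragments the same way; the sketch below is illustrative only, and the file name, model import path, and work directory are assumptions rather than part of this commit:

```python
# configs/eval_humaneval_example.py -- hypothetical entry config, for illustration
from mmengine.config import read_base

with read_base():
    # dataset list added by this commit (module path assumed)
    from .datasets.xhumaneval.humaneval_gen import humaneval_datasets
    # any model config shipped with OpenCompass would work here (path assumed)
    from .models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b

datasets = [*humaneval_datasets]
models = [*internlm2_chat_7b]
work_dir = 'outputs/humaneval_demo'
```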
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg,
    )
]
@@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
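The `num_repeats=10` dataset above pairs with the `k=[1, 10, 100]` evaluator setting: each problem is duplicated so the model produces several samples, and pass@k is then estimated from how many of them pass the unit tests. A minimal sketch of the standard unbiased pass@k estimator used for HumanEval-style scoring (not the OpenCompass implementation itself):

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k given n generated samples, c of which are correct."""
    if n - c < k:
        # Every size-k subset must contain at least one correct sample.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# e.g. 10 samples per task (num_repeats=10), 3 of them passed the tests
print(round(pass_at_k(10, 3, 1), 4))   # 0.3
print(round(pass_at_k(10, 3, 10), 4))  # 1.0 once k reaches n and c > 0
```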
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v2_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='# Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_internal_v2_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v1_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_internal_v1_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg,
    )
]
@@ -0,0 +1,49 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import xHumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2, HumanEvalPlusEvaluator

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

# humaneval_datasets = [
#     dict(
#         abbr='openai_humaneval',
#         type=xHumanevalDataset,
#         path='opencompass/humaneval',
#         reader_cfg=humaneval_reader_cfg,
#         infer_cfg=humaneval_infer_cfg,
#         eval_cfg=humaneval_eval_cfg)
# ]
LANGS = ['ar']
humaneval_datasets = []
for lang in LANGS:
    humaneval_datasets.append(
        dict(
            abbr=f'humaneval_{lang}',
            type=xHumanevalDataset,
            path=f'data/xhumaneval_plus/humaneval_plus_gpt4o_{lang}.jsonl',
            reader_cfg=humaneval_reader_cfg,
            infer_cfg=humaneval_infer_cfg,
            eval_cfg=humaneval_eval_cfg)
    )
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v2_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='# Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_internal_v2_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v1_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_internal_v1_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_plus_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg,
    )
]
@@ -0,0 +1,57 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import xHumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048))

# print('humaneval_plus_datasets ru')
# humaneval_plus_datasets = [
#     dict(
#         abbr='xhumaneval_plus_ru',
#         type=xHumanevalDataset,
#         path='xhumaneval_plus',
#         name='humaneval_plus_gpt4o_ru.jsonl',
#         local_mode=True,
#         reader_cfg=humaneval_plus_reader_cfg,
#         infer_cfg=humaneval_plus_infer_cfg,
#         eval_cfg=humaneval_plus_eval_cfg)
# ]

LANGS = ['de']

humaneval_plus_datasets = []
for lang in LANGS:
    humaneval_plus_eval_cfg = dict(
        evaluator=dict(type=HumanEvalPlusEvaluator),
        pred_role='BOT',
        k=[1, 10],  # the parameter only for humaneval
        lang=lang,
        pred_postprocessor=dict(type=humaneval_postprocess_v2),
    )
    humaneval_plus_datasets.append(
        dict(
            abbr=f'xhumaneval_plus_{lang}',
            type=xHumanevalDataset,
            path=f'data/xhumaneval_plus/humaneval_{lang}.jsonl',
            reader_cfg=humaneval_plus_reader_cfg,
            infer_cfg=humaneval_plus_infer_cfg,
            eval_cfg=humaneval_plus_eval_cfg)
    )
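The loop above builds one dataset entry and one per-language evaluator config for every code in `LANGS`, so adding a translation only means extending the list, provided the matching JSONL file exists under `data/xhumaneval_plus/`. A tiny sketch of the naming scheme the loop would produce (language codes beyond 'de' are hypothetical here):

```python
# Illustration only: what abbrs and paths the config loop would generate.
for lang in ['de', 'ru', 'ar']:
    print(f'xhumaneval_plus_{lang} -> data/xhumaneval_plus/humaneval_{lang}.jsonl')
```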
@@ -0,0 +1,38 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'
            ),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403
@@ -146,6 +146,7 @@ from .xcopa import *  # noqa: F401, F403
from .xiezhi import XiezhiDataset, XiezhiRetriever  # noqa: F401, F403
from .xlsum import *  # noqa: F401, F403
from .xsum import *  # noqa: F401, F403
from .xhumaneval import xHumanevalDataset  # noqa: F401, F403
from .xIFEval.ifeval import xIFEvalDataset, xIFEvaluator  # noqa: F401, F403
from .xlivecodebench import xLCBCodeGenerationDataset, xLCBCodeGenerationEvaluator
from .xmgsm import *  # noqa: F401, F403
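Re-exporting the class from `opencompass/datasets/__init__.py` is what lets the config files above refer to it as `from opencompass.datasets import xHumanevalDataset`. A quick sanity check, assuming an environment with this commit installed:

```python
# Smoke test: the new symbol should now resolve from the package root.
from opencompass.datasets import xHumanevalDataset

print(xHumanevalDataset.__name__)  # 'xHumanevalDataset'
```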
73  opencompass/datasets/xhumaneval.py  Normal file
@@ -0,0 +1,73 @@
# flake8: noqa: E501
# yapf: disable
import copy
import json
import os.path as osp
import re
import tempfile
from os import environ
from typing import List
import os

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset

HUMANEVAL_IMPORT_ERROR = '''\
Please install human_eval use following steps:
git clone git@github.com:open-compass/human-eval.git
cd human-eval && pip install -e .'''

HUMANEVAL_PLUS_IMPORT_ERROR = '''\
Please install evalplus use following steps:
git clone --recurse-submodules git@github.com:open-compass/human-eval.git
cd human-eval
pip install -e .
pip install -e evalplus'''


@LOAD_DATASET.register_module()
class xHumanevalDataset(BaseDataset):

    @staticmethod
    def load(path: str, num_repeats: int = 1, local_mode: bool = False):
        """Load the humaneval dataset for pass@k mode.

        Note that you can use num_repeats > 1 when your model does not support
        `num_return_sequence` in generation; otherwise use the raw humaneval
        dataset and set `num_return_sequence` in the model config to generate
        multiple responses for testing pass@k > 1.

        It is better to change your dataset abbr accordingly if you set
        num_repeats > 1, otherwise the number in `.cache/dataset_size.json`
        might be inconsistent.

        Args:
            num_repeats(int): Number of repetitions of this dataset, used to
                obtain multiple responses in special cases.
        """
        path = get_data_path(path, local_mode=True)
        # path = os.path.join(path, name)
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(path, subset_name='openai_humaneval', split='test')
            dataset_list = []
            for example in dataset:
                dataset_list.extend([example] * num_repeats)
            dataset = Dataset.from_list(dataset_list)
        else:
            # Local JSONL file: keep only the fields the evaluator needs and
            # duplicate each task num_repeats times.
            dataset = []
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    item = json.loads(line.strip())
                    item = {'prompt': item['prompt'], 'task_id': item['task_id'], 'test': item['test'], 'entry_point': item['entry_point'], 'canonical_solution': item['canonical_solution']}
                    dataset.extend([item for _ in range(num_repeats)])
            print(dataset[:10])
            dataset = Dataset.from_list(dataset)
        return dataset
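Since `load()` accepts a plain JSONL path, the translated splits can be inspected without running the full pipeline. A minimal, hedged usage sketch; the path is the one used by the Arabic config above and must exist locally:

```python
# Load one translated split directly and check the num_repeats behaviour.
from opencompass.datasets import xHumanevalDataset

ds = xHumanevalDataset.load(
    path='data/xhumaneval_plus/humaneval_plus_gpt4o_ar.jsonl',  # path from the config above
    num_repeats=2,  # each task appears twice, enabling pass@k with k > 1
)
print(len(ds), ds[0]['task_id'], ds[1]['task_id'])  # consecutive rows share a task_id
```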
1  run.sh  Normal file
@@ -0,0 +1 @@
CUDA_VISIBLE_DEVICES=0 python3 run.py --models hf_llama3_1_8b_instruct --datasets xhumaneval_plus_gen -a vllm --max-num-worker 1