add humaneval

This commit is contained in:
Hanxu Hu 2025-02-19 04:43:17 +01:00
parent c6c4ffc180
commit 8ea13bde6a
41 changed files with 1370 additions and 0 deletions

View File

@ -0,0 +1,69 @@
# HumanEval
```bash
python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```
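All pass@1 numbers below come from the standard unbiased pass@k estimator introduced with HumanEval. As a minimal sketch of that estimator (plain Python, not the exact OpenCompass implementation):

```python
import math

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k given n samples per task, c of which pass."""
    if n - c < k:
        return 1.0
    return 1.0 - math.comb(n - c, k) / math.comb(n, k)

# e.g. 10 samples for a task, 4 of them correct: pass@1 = 0.4
print(pass_at_k(n=10, c=4, k=1))
```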
## Base Models
| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |
## Chat Models
| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function:\n{prompt}\n\n### Response:\n'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]
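The `humaneval_postprocess_v2` referenced above strips chat-model wrapping (markdown fences, surrounding prose) from the prediction before the unit tests run. A simplified sketch of that kind of postprocessor, for illustration only (this is not the actual OpenCompass function):

```python
import re

def extract_python(text: str) -> str:
    """Prefer the first fenced code block; otherwise return the text as-is."""
    match = re.search(r'```(?:python)?\s*\n(.*?)```', text, re.DOTALL)
    return match.group(1) if match else text
```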

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='Complete the following python code:\n{prompt}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,31 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='Complete the following python code:'),
],
round=[
dict(role='HUMAN', prompt='{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,69 @@
# HumanEval
```bash
python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```
## Base Models
| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |
## Chat Models
| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function:\n{prompt}\n\n### Response:\n'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='Complete the following python code:\n{prompt}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,31 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='Complete the following python code:'),
],
round=[
dict(role='HUMAN', prompt='{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
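These `read_base` stubs exist so that a top-level config can pull in the dataset list alongside a model list. A typical top-level evaluation config looks roughly like this (the import paths are assumptions based on the standard OpenCompass layout; adjust them to your tree):

```python
from mmengine.config import read_base

with read_base():
    # paths assumed; point these at the actual config modules in your repo
    from .datasets.humaneval.humaneval_gen import humaneval_datasets
    from .models.hf_internlm.hf_internlm2_chat_7b import models

datasets = humaneval_datasets
```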

View File

@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
HUMANEVAL_TEMPLATE = dict(
round=[
dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
]
)
humaneval_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
k=[1, 10, 100],
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg,
)
]

View File

@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval_passk',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval_repeat10',
type=HumanevalDataset,
path='opencompass/humaneval',
num_repeats=10,
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]
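`num_repeats=10` duplicates every task ten times at inference time so that pass@10 can be estimated even when the backend cannot return multiple samples per prompt. A hypothetical aggregation sketch (the field names here are assumptions, not OpenCompass's evaluator; when n = k the empirical estimate below matches the unbiased estimator):

```python
from collections import defaultdict

def empirical_pass_at_10(results):
    """results: iterable of (task_id, passed) pairs, 10 entries per task.
    A task counts as solved if at least one of its samples passes."""
    by_task = defaultdict(list)
    for task_id, passed in results:
        by_task[task_id].append(passed)
    return sum(any(flags) for flags in by_task.values()) / len(by_task)
```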

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v2_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='# Complete the following python code:\n{prompt}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_internal_v2_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v1_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='Complete the following python code:\n{prompt}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_internal_v1_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
HUMANEVAL_TEMPLATE = dict(
round=[
dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
]
)
humaneval_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
k=[1, 10, 100],
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg,
)
]

View File

@ -0,0 +1,49 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import xHumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
LANGS = ['ar']
humaneval_datasets = []
for lang in LANGS:
    humaneval_datasets.append(
        dict(
            abbr=f'humaneval_{lang}',
            type=xHumanevalDataset,
            path=f'data/xhumaneval_plus/humaneval_plus_gpt4o_{lang}.jsonl',
            reader_cfg=humaneval_reader_cfg,
            infer_cfg=humaneval_infer_cfg,
            eval_cfg=humaneval_eval_cfg)
    )

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval_passk',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval_repeat10',
type=HumanevalDataset,
path='opencompass/humaneval',
num_repeats=10,
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v2_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='# Complete the following python code:\n{prompt}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_internal_v2_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v1_postprocess
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='Complete the following python code:\n{prompt}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_internal_v1_postprocess),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403

View File

@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2
humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
HUMANEVAL_TEMPLATE = dict(
round=[
dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
]
)
humaneval_plus_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
humaneval_plus_eval_cfg = dict(
evaluator=dict(type=HumanEvalPlusEvaluator),
k=[1, 10, 100],
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_plus_datasets = [
dict(
abbr='humaneval_plus',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_plus_reader_cfg,
infer_cfg=humaneval_plus_infer_cfg,
eval_cfg=humaneval_plus_eval_cfg,
)
]
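`HumanEvalPlusEvaluator` scores predictions against the extended EvalPlus test suite. For reference, the evalplus package itself is typically driven like this (based on its documented interface; verify against the version you install, and note that `generate` is a placeholder for your own model call):

```python
from evalplus.data import get_human_eval_plus, write_jsonl

def generate(prompt: str) -> str:
    """Placeholder for the actual model call."""
    raise NotImplementedError

problems = get_human_eval_plus()  # HumanEval+ problems keyed by task_id
samples = [dict(task_id=task_id, solution=generate(problem['prompt']))
           for task_id, problem in problems.items()]
write_jsonl('samples.jsonl', samples)
# score afterwards with: evalplus.evaluate --dataset humaneval --samples samples.jsonl
```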

View File

@ -0,0 +1,57 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import xHumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2
humaneval_plus_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048))
LANGS = ['de']
humaneval_plus_datasets = []
for lang in LANGS:
    humaneval_plus_eval_cfg = dict(
        evaluator=dict(type=HumanEvalPlusEvaluator),
        pred_role='BOT',
        k=[1, 10],  # this parameter is only used for HumanEval
        lang=lang,
        pred_postprocessor=dict(type=humaneval_postprocess_v2),
    )
    humaneval_plus_datasets.append(
        dict(
            abbr=f'xhumaneval_plus_{lang}',
            type=xHumanevalDataset,
            path=f'data/xhumaneval_plus/humaneval_{lang}.jsonl',
            reader_cfg=humaneval_plus_reader_cfg,
            infer_cfg=humaneval_plus_infer_cfg,
            eval_cfg=humaneval_plus_eval_cfg)
    )

View File

@ -0,0 +1,38 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2
humaneval_plus_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'
),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_plus_eval_cfg = dict(
evaluator=dict(type=HumanEvalPlusEvaluator),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_plus_datasets = [
dict(
abbr='humaneval_plus',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_plus_reader_cfg,
infer_cfg=humaneval_plus_infer_cfg,
eval_cfg=humaneval_plus_eval_cfg)
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_plus_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_plus_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_plus_datasets = [
dict(
abbr='humaneval_plus_passk',
type=HumanevalDataset,
path='opencompass/humaneval',
reader_cfg=humaneval_plus_reader_cfg,
infer_cfg=humaneval_plus_infer_cfg,
eval_cfg=humaneval_plus_eval_cfg)
]

View File

@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
humaneval_plus_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='Complete the following python code:\n{prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_plus_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
pred_role='BOT',
k=[1, 10, 100],  # this parameter is only used for HumanEval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_plus_datasets = [
dict(
abbr='humaneval_plus_repeat10',
type=HumanevalDataset,
path='opencompass/humaneval',
num_repeats=10,
reader_cfg=humaneval_plus_reader_cfg,
infer_cfg=humaneval_plus_infer_cfg,
eval_cfg=humaneval_plus_eval_cfg)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403

View File

@ -146,6 +146,7 @@ from .xcopa import * # noqa: F401, F403
from .xiezhi import XiezhiDataset, XiezhiRetriever # noqa: F401, F403
from .xlsum import * # noqa: F401, F403
from .xsum import * # noqa: F401, F403
from .xhumaneval import xHumanevalDataset # noqa: F401, F403
from .xIFEval.ifeval import xIFEvalDataset, xIFEvaluator # noqa: F401, F403
from .xlivecodebench import xLCBCodeGenerationDataset, xLCBCodeGenerationEvaluator  # noqa: F401, F403
from .xmgsm import * # noqa: F401, F403

View File

@ -0,0 +1,73 @@
# flake8: noqa: E501
# yapf: disable
import json
from os import environ

from datasets import Dataset

from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset

HUMANEVAL_IMPORT_ERROR = '''\
Please install human-eval with the following steps:
git clone git@github.com:open-compass/human-eval.git
cd human-eval && pip install -e .'''

HUMANEVAL_PLUS_IMPORT_ERROR = '''\
Please install evalplus with the following steps:
git clone --recurse-submodules git@github.com:open-compass/human-eval.git
cd human-eval
pip install -e .
pip install -e evalplus'''


@LOAD_DATASET.register_module()
class xHumanevalDataset(BaseDataset):

    @staticmethod
    def load(path: str, num_repeats: int = 1, local_mode: bool = False):
        """Load a HumanEval-style dataset for pass@k evaluation.

        Note that you can use num_repeats > 1 when your model does not
        support `num_return_sequence` in generation; otherwise use the raw
        HumanEval dataset and set `num_return_sequence` in the model config
        to generate multiple responses for testing pass@k with k > 1.

        It is better to change the dataset abbr accordingly if you set
        num_repeats > 1, otherwise the number recorded in
        `.cache/dataset_size.json` might become inconsistent.

        Args:
            num_repeats(int): Number of repetitions of each example, used
                to obtain multiple responses in special cases.
        """
        path = get_data_path(path, local_mode=True)  # translated JSONL files are read locally
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(path, subset_name='openai_humaneval', split='test')
            dataset_list = []
            for example in dataset:
                dataset_list.extend([example] * num_repeats)
            dataset = Dataset.from_list(dataset_list)
        else:
            dataset = []
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    item = json.loads(line.strip())
                    # keep only the fields the evaluator needs
                    item = {
                        'prompt': item['prompt'],
                        'task_id': item['task_id'],
                        'test': item['test'],
                        'entry_point': item['entry_point'],
                        'canonical_solution': item['canonical_solution'],
                    }
                    dataset.extend([item for _ in range(num_repeats)])
            dataset = Dataset.from_list(dataset)
        return dataset
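A quick way to sanity-check the loader against one of the translated files (the path below is taken from the configs above and assumes the data has been downloaded, e.g. via start.sh):

```python
ds = xHumanevalDataset.load(
    path='data/xhumaneval_plus/humaneval_plus_gpt4o_ar.jsonl',
    num_repeats=1)
print(len(ds), ds[0]['task_id'])
```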

run.sh Normal file (1 line added)
View File

@ -0,0 +1 @@
CUDA_VISIBLE_DEVICES=0 python3 run.py --models hf_llama3_1_8b_instruct --datasets xhumaneval_plus_gen -a vllm --max-num-worker 1

start.sh Normal file (6 lines added)
View File

@ -0,0 +1,6 @@
cd ../
git lfs install
git clone https://huggingface.co/datasets/LLaMAX/BenchMAX_Function_Completion
mkdir -p ~/.cache/opencompass/data/xhumaneval_plus
mv BenchMAX_Function_Completion/* ~/.cache/opencompass/data/xhumaneval_plus/
cd opencompass