Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Merge b2c84058f2 into 6a6a1a5c0b (commit a3fa2fb105)
examples/eval_codebench.py (new file, 153 lines)
@@ -0,0 +1,153 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

with read_base():
    # Datasets Part
    # bigcodebench
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
        bigcodebench_full_instruct_datasets
    )
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
        bigcodebench_hard_instruct_datasets
    )
    # livecodebench code generation lite v5
    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
        LCB_datasets
    )
    # humaneval
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
        humaneval_datasets
    )
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    from opencompass.configs.datasets.humanevalx.humanevalx_gen_627de5 import (
        humanevalx_datasets
    )
    # mbpp
    from opencompass.configs.datasets.mbpp.mbpp_gen import (
        mbpp_datasets
    )
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
        mbpppro_datasets
    )
    # multipl-e
    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
        multiple_datasets
    )
    # ds1000
    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
        ds1000_datasets
    )

    # Models Part
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
        models as lmdeploy_internlm3_8b_instruct_model,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.ds1000 import (
        ds1000_summary_groups,
    )
    from opencompass.configs.summarizers.groups.multipl_e import (
        multiple_summary_groups,
    )
    from opencompass.configs.summarizers.groups.humanevalx import (
        humanevalx_summary_groups,
    )

# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

for model in models:
    model['max_seq_len'] = 16384
    model['max_out_len'] = 8192

# datasets config
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

for item in humanevalx_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
    item['eval_cfg']['evaluator']['port'] = ''
for item in ds1000_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
    item['eval_cfg']['evaluator']['port'] = ''

for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192

# summary
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.append(
    {'name': 'humanevalx',
     'subsets': ['humanevalx-python', 'humanevalx-cpp',
                 'humanevalx-java', 'humanevalx-js']}
)
summarizer = dict(
    dataset_abbrs=[
        ['bigcodebench_hard_instruct', 'pass@1'],
        ['bigcodebench_full_instruct', 'pass@1'],
        ['lcb_code_generation', 'pass@1'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['mbpp', 'score'],
        ['humaneval_pro', 'pass@1'],
        ['mbpp_pro', 'pass@1'],
        ['multiple', 'naive_average'],
        ['humanevalx', 'naive_average'],
        ['ds1000', 'naive_average'],
        '',
        'humanevalx-python',
        'humanevalx-cpp',
        'humanevalx-java',
        'humanevalx-js',
        '',
        'ds1000_Pandas',
        'ds1000_Numpy',
        'ds1000_Tensorflow',
        'ds1000_Scipy',
        'ds1000_Sklearn',
        'ds1000_Pytorch',
        'ds1000_Matplotlib',
        '',
        'humaneval-multiple-cpp',
        'humaneval-multiple-cs',
        'humaneval-multiple-go',
        'humaneval-multiple-java',
        'humaneval-multiple-rb',
        'humaneval-multiple-js',
        'humaneval-multiple-php',
        'humaneval-multiple-r',
        'humaneval-multiple-rs',
        'humaneval-multiple-sh',
        '',
        'mbpp-multiple-cpp',
        'mbpp-multiple-cs',
        'mbpp-multiple-go',
        'mbpp-multiple-java',
        'mbpp-multiple-rb',
        'mbpp-multiple-js',
        'mbpp-multiple-php',
        'mbpp-multiple-r',
        'mbpp-multiple-rs',
        'mbpp-multiple-sh'
    ],
    summary_groups=summary_groups,
)

work_dir = 'outputs/code'
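Note: the locals() scans in this config are just a compact way to concatenate every imported list; for the two model lists imported above, an equivalent hand-written spelling (illustration only) is:

models = [*lmdeploy_qwen2_5_7b_instruct_model,
          *lmdeploy_internlm3_8b_instruct_model]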
examples/eval_codebench_passk.py (new file, 161 lines)
@@ -0,0 +1,161 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

with read_base():
    # Datasets Part
    # bigcodebench
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
        bigcodebench_full_instruct_datasets
    )
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
        bigcodebench_hard_instruct_datasets
    )
    # livecodebench code generation lite v5
    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
        LCB_datasets
    )
    # humaneval
    from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import (
        humaneval_datasets
    )
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    # mbpp
    from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import (
        mbpp_datasets
    )
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
        mbpppro_datasets
    )
    # multipl-e
    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
        multiple_datasets
    )

    # Models Part
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
        models as lmdeploy_internlm3_8b_instruct_model,
    )

# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    model['max_seq_len'] = 16384
    model['max_out_len'] = 8192

# datasets config
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
num_repeats = 5
k = (1, 3, 5)
for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
    # openai pass@k config: the current setting is pass@5 (n=5).
    if not any(exclude in dataset['abbr'] for exclude in ('mbpp', 'humaneval')):
        dataset['eval_cfg']['evaluator']['num_repeats'] = num_repeats
        dataset['eval_cfg']['evaluator']['k'] = k
    dataset['num_repeats'] = num_repeats
    # dataset['abbr'] += f'_passk'

# summary
summarizer = dict(
    dataset_abbrs=[
        'pass@1',
        ['bigcodebench_full_instruct_passk', 'pass@1'],
        ['bigcodebench_hard_instruct_passk', 'pass@1'],
        ['lcb_code_generation_passk', 'pass@1'],
        ['openai_humaneval_passk_passk', 'humaneval_pass@1'],
        ['humaneval_pro_passk', 'pass@1'],
        ['mbpp_passk_passk', 'pass@1'],
        ['mbpp_pro_passk', 'pass@1'],
        ['humaneval-multiple-cpp_passk', 'pass@1'],
        ['humaneval-multiple-cs_passk', 'pass@1'],
        ['humaneval-multiple-go_passk', 'pass@1'],
        ['humaneval-multiple-java_passk', 'pass@1'],
        ['humaneval-multiple-rb_passk', 'pass@1'],
        ['humaneval-multiple-js_passk', 'pass@1'],
        ['humaneval-multiple-php_passk', 'pass@1'],
        ['humaneval-multiple-r_passk', 'pass@1'],
        ['humaneval-multiple-rs_passk', 'pass@1'],
        ['humaneval-multiple-sh_passk', 'pass@1'],
        ['mbpp-multiple-cpp_passk', 'pass@1'],
        ['mbpp-multiple-cs_passk', 'pass@1'],
        ['mbpp-multiple-go_passk', 'pass@1'],
        ['mbpp-multiple-java_passk', 'pass@1'],
        ['mbpp-multiple-rb_passk', 'pass@1'],
        ['mbpp-multiple-js_passk', 'pass@1'],
        ['mbpp-multiple-php_passk', 'pass@1'],
        ['mbpp-multiple-r_passk', 'pass@1'],
        ['mbpp-multiple-rs_passk', 'pass@1'],
        ['mbpp-multiple-sh_passk', 'pass@1'],
        '',
        'pass@3',
        ['bigcodebench_full_instruct_passk', 'pass@3'],
        ['bigcodebench_hard_instruct_passk', 'pass@3'],
        ['lcb_code_generation_passk', 'pass@3'],
        ['openai_humaneval_passk_passk', 'humaneval_pass@3'],
        ['humaneval_pro_passk', 'pass@3'],
        ['mbpp_passk_passk', 'pass@3'],
        ['mbpp_pro_passk', 'pass@3'],
        ['humaneval-multiple-cpp_passk', 'pass@3'],
        ['humaneval-multiple-cs_passk', 'pass@3'],
        ['humaneval-multiple-go_passk', 'pass@3'],
        ['humaneval-multiple-java_passk', 'pass@3'],
        ['humaneval-multiple-rb_passk', 'pass@3'],
        ['humaneval-multiple-js_passk', 'pass@3'],
        ['humaneval-multiple-php_passk', 'pass@3'],
        ['humaneval-multiple-r_passk', 'pass@3'],
        ['humaneval-multiple-rs_passk', 'pass@3'],
        ['humaneval-multiple-sh_passk', 'pass@3'],
        ['mbpp-multiple-cpp_passk', 'pass@3'],
        ['mbpp-multiple-cs_passk', 'pass@3'],
        ['mbpp-multiple-go_passk', 'pass@3'],
        ['mbpp-multiple-java_passk', 'pass@3'],
        ['mbpp-multiple-rb_passk', 'pass@3'],
        ['mbpp-multiple-js_passk', 'pass@3'],
        ['mbpp-multiple-php_passk', 'pass@3'],
        ['mbpp-multiple-r_passk', 'pass@3'],
        ['mbpp-multiple-rs_passk', 'pass@3'],
        ['mbpp-multiple-sh_passk', 'pass@3'],
        '',
        'pass@5',
        ['bigcodebench_full_instruct_passk', 'pass@5'],
        ['bigcodebench_hard_instruct_passk', 'pass@5'],
        ['lcb_code_generation_passk', 'pass@5'],
        ['openai_humaneval_passk_passk', 'humaneval_pass@5'],
        ['humaneval_pro_passk', 'pass@5'],
        ['mbpp_passk_passk', 'pass@5'],
        ['mbpp_pro_passk', 'pass@5'],
        ['humaneval-multiple-cpp_passk', 'pass@5'],
        ['humaneval-multiple-cs_passk', 'pass@5'],
        ['humaneval-multiple-go_passk', 'pass@5'],
        ['humaneval-multiple-java_passk', 'pass@5'],
        ['humaneval-multiple-rb_passk', 'pass@5'],
        ['humaneval-multiple-js_passk', 'pass@5'],
        ['humaneval-multiple-php_passk', 'pass@5'],
        ['humaneval-multiple-r_passk', 'pass@5'],
        ['humaneval-multiple-rs_passk', 'pass@5'],
        ['humaneval-multiple-sh_passk', 'pass@5'],
        ['mbpp-multiple-cpp_passk', 'pass@5'],
        ['mbpp-multiple-cs_passk', 'pass@5'],
        ['mbpp-multiple-go_passk', 'pass@5'],
        ['mbpp-multiple-java_passk', 'pass@5'],
        ['mbpp-multiple-rb_passk', 'pass@5'],
        ['mbpp-multiple-js_passk', 'pass@5'],
        ['mbpp-multiple-php_passk', 'pass@5'],
        ['mbpp-multiple-r_passk', 'pass@5'],
        ['mbpp-multiple-rs_passk', 'pass@5'],
        ['mbpp-multiple-sh_passk', 'pass@5'],
    ],
)

work_dir = 'outputs/code_passk'
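Note: num_repeats = 5 with k = (1, 3, 5) feeds the unbiased pass@k estimator that this PR wires into the evaluators (estimate_pass_at_k in code_evaluator.py further down). A minimal standalone sketch of that formula, for illustration only (pass_at_k here is a local helper, not an OpenCompass API):

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    # 1 - C(n-c, k) / C(n, k): the chance that at least one of k
    # completions drawn from the n generated ones passes, given that
    # c of the n passed.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# a problem where 2 of the 5 repeats passed:
print(pass_at_k(5, 2, 1))  # 0.4
print(pass_at_k(5, 2, 3))  # 0.9
print(pass_at_k(5, 2, 5))  # 1.0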
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)

bigcodebench_full_reader_cfg = dict(
    input_columns=['instruct_prompt'],
    output_column='test',
)

bigcodebench_full_infer_cfg = dict(prompt_template=dict(
    type=PromptTemplate,
    template=dict(
        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
        round=[
            dict(role='HUMAN', prompt='{instruct_prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

bigcodebench_full_eval_cfg = dict(
    evaluator=dict(
        type=BigCodeBenchEvaluator,
        release_version='v0.1.2',
        eval_type='instruct',
        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
        # remote_execute_api=
        # 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
        dataset_version='full',
        num_repeats=1,
        k=1,
    ),
    pred_role='BOT',
)

bigcodebench_full_instruct_datasets = [
    dict(abbr='bigcodebench_full_instruct',
         type=BigCodeBenchDataset,
         path='opencompass/bigcodebench',
         reader_cfg=bigcodebench_full_reader_cfg,
         infer_cfg=bigcodebench_full_infer_cfg,
         eval_cfg=bigcodebench_full_eval_cfg,
         release_version='v0.1.2',
         num_repeats=1,)
]
@@ -24,10 +24,12 @@ bigcodebench_hard_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
-        remote_execute_api=
-        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
+        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api=
+        # 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
+        num_repeats=1,
+        k=1,
     ),
     pred_role='BOT',
 )
@@ -42,5 +44,6 @@ bigcodebench_hard_instruct_datasets = [
         eval_cfg=bigcodebench_hard_eval_cfg,
         release_version='v0.1.2',
         dataset_version='hard',
+        num_repeats=1,
     )
 ]
@@ -19,9 +19,9 @@ humaneval_infer_cfg = dict(
     inferencer=dict(type=GenInferencer, max_out_len=512))

 humaneval_eval_cfg = dict(
-    evaluator=dict(type=HumanEvalEvaluator),
+    evaluator=dict(type=HumanEvalEvaluator,
+                   k=1),
     pred_role='BOT',
-    k=[1, 10, 100],  # the parameter only for humaneval
     pred_postprocessor=dict(type=humaneval_postprocess_v2),
 )

@@ -32,5 +32,6 @@ humaneval_datasets = [
         path='opencompass/humaneval',
         reader_cfg=humaneval_reader_cfg,
         infer_cfg=humaneval_infer_cfg,
-        eval_cfg=humaneval_eval_cfg)
+        eval_cfg=humaneval_eval_cfg,
+        num_repeats=1)
 ]
@@ -33,4 +33,4 @@ humaneval_plus_datasets = [
         reader_cfg=humaneval_plus_reader_cfg,
         infer_cfg=humaneval_plus_infer_cfg,
         eval_cfg=humaneval_plus_eval_cfg)
-]
+]
opencompass/configs/datasets/humaneval_pro/README.md (new file, 17 lines)
@@ -0,0 +1,17 @@
# HumanEval pro

## OC results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     67 |
| deepseek-v2-lite-chat-hf     |     35 |

## CodeEval-pro results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     28 |
@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2

OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

@@ Response
Please put the two solutions to the above problems in one Python code block.
"""

PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""

humanevalpro_reader_cfg = dict(
    input_columns=['raw_problem', 'new_problem'], output_column='test_code')

humanevalpro_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=PROMPT_WRAPPER),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humanevalpro_eval_cfg = dict(
    evaluator=dict(type=HumanevalProEvaluator,
                   ip_address='https://opencompass-multiple-evaluator.hf.space',
                   k=1)
)

humanevalpro_datasets = [
    dict(
        abbr='humaneval_pro',
        type=HumanevalevalProDataset,
        path='opencompass/humaneval_pro',
        num_repeats=1,
        reader_cfg=humanevalpro_reader_cfg,
        infer_cfg=humanevalpro_infer_cfg,
        eval_cfg=humanevalpro_eval_cfg,)
]
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator

humanevalx_reader_cfg = dict(
    input_columns=['prompt'], output_column='declaration', train_split='test')

humanevalx_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humanevalx_eval_cfg_dict = {
    lang: dict(
        evaluator=dict(
            type=HumanevalXEvaluator,
            language=lang,
            ip_address='localhost',  # replace with your code_eval_server ip_address and port
            port=5001),  # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
        pred_role='BOT')
    for lang in ['python', 'cpp', 'java', 'js']  # rust & go are not supported yet
}

# Please download the needed `xx.jsonl.gz` files from
# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
# and move them into the `data/humanevalx/` folder
humanevalx_datasets = [
    dict(
        type=HumanevalXDataset,
        abbr=f'humanevalx-{lang}',
        language=lang,
        path='./data/humanevalx',
        reader_cfg=humanevalx_reader_cfg,
        infer_cfg=humanevalx_infer_cfg,
        eval_cfg=humanevalx_eval_cfg_dict[lang])
    for lang in ['python', 'cpp', 'java', 'js']
]
@@ -33,9 +33,11 @@ lcb_code_generation_eval_cfg = dict(
     evaluator=dict(type=LCBCodeGenerationEvaluator,
                    num_process_evaluate=4,
                    timeout=6,
-                   release_version='release_v5',
+                   release_version='v5',
                    start_date='2024-08-01',
-                   end_date='2025-02-01'),
+                   end_date='2025-02-01',
+                   num_repeats=1,
+                   k=1,),
     pred_role='BOT',
 )

@@ -46,7 +48,8 @@ LCBCodeGeneration_dataset = dict(
     reader_cfg=lcb_code_generation_reader_cfg,
     infer_cfg=lcb_code_generation_infer_cfg,
     eval_cfg=lcb_code_generation_eval_cfg,
-    release_version='release_v5',
+    release_version='v5',
+    num_repeats=1,
 )

 # Code Execution Dataset
@@ -127,6 +130,6 @@ LCBTestOutput_dataset = dict(

 LCB_datasets = [
     LCBCodeGeneration_dataset,
-    LCBCodeExecution_dataset,
-    LCBTestOutput_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
 ]
@@ -28,7 +28,9 @@ mbpp_infer_cfg = dict(
     inferencer=dict(type=GenInferencer, max_out_len=512),
 )

-mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT')
+mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator,
+                                    k=1),
+                     pred_role='BOT')

 mbpp_datasets = [
     dict(
@@ -38,5 +40,6 @@ mbpp_datasets = [
         reader_cfg=mbpp_reader_cfg,
         infer_cfg=mbpp_infer_cfg,
         eval_cfg=mbpp_eval_cfg,
+        num_repeats=1,
     )
 ]
opencompass/configs/datasets/mbpp_pro/README.md (new file, 17 lines)
@@ -0,0 +1,17 @@
# MBPP pro

## OC results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     66 |
| qwen2.5-14b-instruct-hf      |     64 |
| deepseek-v2-lite-chat-hf     |     36 |

## CodeEval-pro results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     39 |
opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator

OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

@@ Response
Please put the two solutions to the above problems in one Python code block.
"""

PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""

mbpppro_reader_cfg = dict(
    input_columns=['raw_problem', 'new_problem'], output_column='test_code')

mbpppro_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=PROMPT_WRAPPER),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

mbpppro_eval_cfg = dict(
    evaluator=dict(type=MBPPProEvaluator,
                   ip_address='https://opencompass-multiple-evaluator.hf.space',
                   k=1),
)

mbpppro_datasets = [
    dict(
        abbr='mbpp_pro',
        type=MBPPProDataset,
        path='opencompass/mbpp_pro',
        num_repeats=1,
        reader_cfg=mbpppro_reader_cfg,
        infer_cfg=mbpppro_infer_cfg,
        eval_cfg=mbpppro_eval_cfg)
]
@@ -21,6 +21,7 @@ multiple_eval_cfg = {
     evaluator=dict(
         type=MultiplEEvaluator,
         language=lang,
+        k=1,
         ip_address='https://opencompass-multiple-evaluator.hf.space',
     ),
     pred_role='BOT',
opencompass/configs/summarizers/groups/multipl_e.py (new file, 6 lines)
@@ -0,0 +1,6 @@
multiple_summary_groups = []

humaneval_multiple = ['humaneval-multiple-cpp', 'humaneval-multiple-cs', 'humaneval-multiple-go', 'humaneval-multiple-java', 'humaneval-multiple-rb', 'humaneval-multiple-js', 'humaneval-multiple-php', 'humaneval-multiple-r', 'humaneval-multiple-rs', 'humaneval-multiple-sh']
mbpp_multiple = ['mbpp-multiple-cpp', 'mbpp-multiple-cs', 'mbpp-multiple-go', 'mbpp-multiple-java', 'mbpp-multiple-rb', 'mbpp-multiple-js', 'mbpp-multiple-php', 'mbpp-multiple-r', 'mbpp-multiple-rs', 'mbpp-multiple-sh']
multiple_summary_groups.append({'name': 'multiple', 'subsets': humaneval_multiple})
multiple_summary_groups.append({'name': 'multiple', 'subsets': mbpp_multiple})
@@ -62,6 +62,7 @@ from .hle import *  # noqa: F401, F403
 from .huggingface import *  # noqa: F401, F403
 from .humaneval import *  # noqa: F401, F403
 from .humaneval_multi import *  # noqa: F401, F403
+from .humaneval_pro import *  # noqa: F401, F403
 from .humanevalx import *  # noqa: F401, F403
 from .hungarian_math import *  # noqa: F401, F403
 from .IFEval.ifeval import IFEvalDataset, IFEvaluator  # noqa: F401, F403
@@ -91,6 +92,7 @@ from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
 from .mathbench import *  # noqa: F401, F403
 from .mbpp import *  # noqa: F401, F403
+from .mbpp_pro import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .MedXpertQA import *  # noqa: F401, F403
 from .mgsm import *  # noqa: F401, F403
@@ -4,6 +4,7 @@
 import os
+import time
 from concurrent.futures._base import CancelledError
 from typing import List, Sequence, Tuple, Union

 import httpx
 from datasets import Dataset, DatasetDict
@@ -24,7 +25,8 @@ class BigCodeBenchDataset(BaseDataset):
     def load(path: str = 'opencompass/bigcodebench',
              local_mode: bool = False,
              release_version: str = 'v0.1.2',
-             dataset_version: str = 'full'):
+             dataset_version: str = 'full',
+             num_repeats: int = 1):
         """
         Args:
             path (str): The path to the dataset.
@@ -33,6 +35,7 @@ class BigCodeBenchDataset(BaseDataset):
             release_version (str): The release version of the dataset.
             dataset_version (str): The data version of the dataset.
                 only support ['full', 'hard']
+            num_repeats (int): Number of times to repeat dataset for pass@k.
         """
         assert dataset_version in ['full', 'hard'], \
             'dataset_version should be one of ["full", "hard"], '
@@ -45,11 +48,13 @@ class BigCodeBenchDataset(BaseDataset):
         # 'entry_point', 'doc_struct', 'libs'
         if dataset_version == 'full':
             items = JSONToolkit.read_jsonl(
-                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'))
+                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'),
+                num_repeats)
         else:
             items = JSONToolkit.read_jsonl(
                 os.path.join(path,
-                             f'BigCodeBench-Hard-{release_version}.jsonl'))
+                             f'BigCodeBench-Hard-{release_version}.jsonl'),
+                num_repeats)

         dataset['train'] = Dataset.from_list(items)
         dataset['test'] = Dataset.from_list(items)
@@ -61,10 +66,10 @@ class BigCodeBenchEvaluator(BaseEvaluator):
     """Evaluator for BigCodeBench.

     Args:
         num_process_evaluate (int): number of processes to evaluate
         timeout (int): timeout for each evaluation
         release_version (str): release version of BigCodeBench
         eval_type (str): type of evaluation, either 'instruct' or 'completion'
+        k (str): pass@k for evaluation
     """

     def __init__(
@@ -75,7 +80,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
         dataset_version: str = 'full',
         local_mode: bool = False,
         path: str = 'opencompass/bigcodebench',
-        pass_k: str = '1,5,10',
+        num_repeats=1,
+        k: Union[int, Tuple[int, ...], List[int]] = 1,
         parallel: int = -1,
         min_time_limit: float = 1,
         max_as_limit: int = 30 * 1024,
@@ -88,12 +95,17 @@ class BigCodeBenchEvaluator(BaseEvaluator):
             release_version=release_version,
             dataset_version=dataset_version,
             local_mode=local_mode,
-            path=path)['test']
+            path=path,
+            num_repeats=num_repeats)['test']
         self.eval_type = eval_type
+        if not isinstance(k, Sequence):
+            k = (k, )
+        k = ', '.join(map(str, k))
+        self.k = k
         self.remote_execute_api = remote_execute_api

         self.eval_kwargs = dict(subset=dataset_version,
-                                pass_k=pass_k,
+                                pass_k=self.k,
                                 parallel=parallel,
                                 min_time_limit=min_time_limit,
                                 max_as_limit=max_as_limit,
@@ -141,7 +153,7 @@ class BigCodeBenchEvaluator(BaseEvaluator):
             signal.alarm(0)
             signal.signal(signal.SIGALRM, original_handler)

-        with timeout_handler(10):
+        with timeout_handler(300):
             sanitized_prediction = extract_code_generation(
                 prediction, entrypoint=entrypoint)

@@ -188,7 +200,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
         while True:
             try:
                 eval_client = Client(self.remote_execute_api,
-                                     httpx_kwargs=dict(proxies=proxies))
+                                     httpx_kwargs=dict(
+                                         proxies=proxies,
+                                         timeout=httpx.Timeout(100.0)))
                 results, pass_at_k = eval_client.predict(
                     split=self.eval_type,
                     samples=handle_file(submitted_contents_path),
@@ -196,22 +210,25 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                     **self.eval_kwargs)
                 break
             except (httpx.ReadTimeout, CancelledError):
-                logger.info('Read timeout error. Retrying in 4s...')
+                logger.info('Read timeout error. Retrying in 10s...')
+                time.sleep(10)

-        if 'pass@1' in pass_at_k.keys():
-            pass_at_k['pass@1'] *= 100
-        dump_results = {'details': self._results_processor(results)}
-        dump_results.update(pass_at_k)
-
-        return dump_results
+        pass_at_k = {
+            k: v * 100 if isinstance(v, (int, float)) else v
+            for k, v in pass_at_k.items()
+        }
+        return {
+            **pass_at_k,
+            'details': self._results_processor(results),
+        }

     def _results_processor(self, results):
         details = []
         for key, value in results['eval'].items():
-            if value[0]['status'] == 'pass':
-                value[0]['correct'] = True
-            else:
-                value[0]['correct'] = False
-            details.append(value[0])
+            detail = {'correct': False, 'results_details': value}
+            for v in value:
+                if v['status'] == 'pass':
+                    detail['correct'] = True
+                    break
+            details.append(detail)
         return details
@@ -191,14 +191,19 @@ class CodeCustomDataset(BaseDataset):
         path = get_data_path(path, local_mode=local_mode)
         if file_name is not None:
             path = os.path.join(path, file_name)
+        files = os.listdir(path)
         data = []
-        if path.endswith('.jsonl'):
-            with open(path, 'r', encoding='utf-8') as f:
+        if any(f.endswith('.jsonl') for f in files):
+            target_file = next(f for f in files if f.endswith('.jsonl'))
+            target_path = os.path.join(path, target_file)
+            with open(target_path, 'r', encoding='utf-8') as f:
                 for line in f:
                     data.extend(
                         [json.loads(line.strip()) for _ in range(num_repeats)])
-        elif path.endswith('.csv'):
-            with open(path, 'r', encoding='utf-8-sig') as f:
+        elif any(f.endswith('.csv') for f in files):
+            target_file = next(f for f in files if f.endswith('.csv'))
+            target_path = os.path.join(path, target_file)
+            with open(target_path, 'r', encoding='utf-8-sig') as f:
                 reader = csv.reader(f)
                 header = next(reader)
                 for row in reader:
@@ -6,7 +6,7 @@ import os.path as osp
 import re
 import tempfile
 from os import environ
-from typing import List
+from typing import List, Sequence, Tuple, Union

 from datasets import Dataset

@@ -70,12 +70,16 @@ class HumanevalDataset(BaseDataset):
 class HumanEvalEvaluator(BaseEvaluator):
     """Evaluator for HumanEval or EvalPlus."""

-    def __init__(self, k: List[int] = [1, 10, 100]) -> None:
+    def __init__(self, k: Union[int, Tuple[int, ...], List[int]] = 1,
+                 num_repeats: int = 1) -> None:
         try:
             import human_eval
         except ImportError:
             raise ImportError(HUMANEVAL_IMPORT_ERROR)

+        self.n = num_repeats
+        if not isinstance(k, Sequence):
+            k = (k, )
         self.k = k
         super().__init__()

@@ -87,16 +91,24 @@ class HumanEvalEvaluator(BaseEvaluator):
         from human_eval.evaluation import evaluate_functional_correctness

         prompts = [item['prompt'] for item in test_set]
-        humaneval_preds = []
+        predictions_processed, references_processed = [], []
+        for pred, refer in zip(predictions, references):
+            if references_processed and refer == references_processed[-1]:
+                predictions_processed[-1].extend([pred])
+            else:
+                references_processed.append(refer)
+                predictions_processed.append([pred])
+
         # create json file in human_eval format
-        for preds, refer in zip(predictions, references):
+        humaneval_preds = []
+        for preds_p, refer_p in zip(predictions_processed, references_processed):
             # suits for two case
             # 1. use repeated dataset
             # 2. use `num_return_sequences` to generate multiple responses
-            if not isinstance(preds, list):
-                preds = [preds]
-            for pred in preds:
-                humaneval_preds.append({'task_id': refer, 'completion': pred})
+            if not isinstance(preds_p, list):
+                preds_p = [preds_p]
+            for pred_p in preds_p:
+                humaneval_preds.append({'task_id': refer_p, 'completion': pred_p})
         with tempfile.TemporaryDirectory() as tmp_dir:
             out_dir = osp.join(tmp_dir, 'human_eval.json')
             write_jsonl(out_dir, humaneval_preds)
@@ -183,13 +195,13 @@ def humaneval_postprocess_v2(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[0]
-    return text
+    return text.lstrip()


 def humaneval_postprocess_v3(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[-1]
-    return text
+    return text.lstrip()


 def humaneval_internal_v2_postprocess(text: str):
     if text.startswith(' ') and not text.startswith('    '):
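Note: the regrouping added to HumanEvalEvaluator.score collapses a dataset that was repeated num_repeats times back into one candidate list per task. It relies on repeated copies of a task being adjacent, which is how the repeated datasets in this PR are built. A minimal illustration with hypothetical task ids:

references = ['HumanEval/0', 'HumanEval/0', 'HumanEval/1', 'HumanEval/1']
predictions = ['cand_a', 'cand_b', 'cand_c', 'cand_d']
# after the loop:
#   references_processed  == ['HumanEval/0', 'HumanEval/1']
#   predictions_processed == [['cand_a', 'cand_b'], ['cand_c', 'cand_d']]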
opencompass/datasets/humaneval_pro.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import json
from typing import Dict, List

import numpy as np
from datasets import Dataset

from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset


class HumanevalevalProDataset(BaseDataset):

    @staticmethod
    def load(path, num_repeats=1, local_mode=False):
        path = get_data_path(path, local_mode=local_mode)
        dataset = []
        with open(path, encoding='utf-8') as f:
            raw_data = json.load(f)
            for data in raw_data:
                dataset.extend([data for _ in range(num_repeats)])
        return Dataset.from_list(dataset)


class HumanevalProEvaluator(CodeEvaluator):

    def _process_completions(self, test_case: dict, completions: list) -> list:
        processed_completions = []
        for comp in completions:
            post_comp = self._extract_code(comp)
            processed_completions.append(post_comp)
        return processed_completions

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        test_set = test_set.to_pandas()
        # Use the first column as the unique identifier
        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
        num_repeats = int(len(test_set) / len(test_set_origin))

        # 1. Prepare data for all test cases
        all_test_cases = []
        for i in range(len(test_set_origin)):
            test_case = test_set_origin.iloc[i]
            completions = predictions[i * num_repeats:(i + 1) * num_repeats]

            # Process code completions
            processed_completions = self._process_completions(
                test_case, completions)

            sub_data_dict = {
                'name': int(test_case['id']),
                'language': self.language,
                'prompt': '',
                'tests': test_case['test_code'],
                'processed_completions': processed_completions,
                'completions': completions
            }

            all_test_cases.append(sub_data_dict)

        # 2. Send all test cases to the evaluation service
        success, outputs, error_message = self._evaluate(all_test_cases)
        if not success:
            return {'error': error_message}

        # 3. Process the returned results
        details = []
        total, correct = [], []
        for output in outputs:
            passed = [m['status'] == 'OK' for m in output['meta_data']]
            total.append(len(passed))
            correct.append(sum(passed))
            details.append(output)
        total = np.array(total)
        correct = np.array(correct)

        pass_at_k = {
            f'pass@{k}':
            self.estimate_pass_at_k(total, correct, k).mean() * 100
            for k in self.k if (total >= k).all()
        }

        return {
            **pass_at_k,
            'details': details,
        }
opencompass/datasets/humaneval_pro_.py (new file, 89 lines)
@@ -0,0 +1,89 @@
import json
import os

import evaluate
from datasets import Dataset

from opencompass.openicl.icl_evaluator import HuggingfaceEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset

os.environ['HF_ALLOW_CODE_EVAL'] = '1'


class HumanevalevalProDataset(BaseDataset):

    @staticmethod
    def load(path, num_repeats=1, local_mode=False):
        path = get_data_path(path, local_mode=local_mode)
        dataset = []
        with open(path, encoding='utf-8') as f:
            raw_data = json.load(f)
            for data in raw_data:
                dataset.extend([data for _ in range(num_repeats)])
        return Dataset.from_list(dataset)


class HumanevalProEvaluator(HuggingfaceEvaluator):

    def _preprocess(self, predictions, references):
        predictions = [[_] for _ in predictions]
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores):
        scores = {f'humaneval_{k}': scores[k] * 100 for k in scores}
        return scores

    def score(self, predictions, references, test_set):
        # predictions are the LLM's outputs; references are the 'output_column' of 'humanevalpro_reader_cfg'  # noqa: E501
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        # use code pre-downloaded into the opencompass repo to avoid downloading
        current_dir = os.path.dirname(os.path.abspath(__file__))
        parent_dir = os.path.dirname(current_dir)
        local_path = os.path.join(parent_dir, 'openicl', 'icl_evaluator',
                                  'hf_metrics', self.metric)

        if os.path.exists(local_path):
            metric = evaluate.load(local_path)
        else:
            metric = evaluate.load(self.metric)
        scores, _ = metric.compute(**self._preprocess(predictions, references),
                                   k=[1, 3, 5],
                                   num_workers=4)
        result = self._postprocess(scores)
        return result


def humanevalpro_postprocess_official(text):
    """The official post-processing method for humaneval_pro, solely
    applicable to the complete-generation paradigm.

    The chat-template paradigm requires a different post-processing
    method.
    """
    text = text[:index if (index := text.find('```')) != -1 else len(text)]
    return text


def humanevalpro_postprocess_oc(text):
    """Recommended post-processing for outputs generated under the
    chat-template paradigm."""
    start = text.rfind('```python') + len('```python')
    end = text.find('```', start)

    code = text[start:end].strip()
    return code
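For example, humanevalpro_postprocess_oc pulls the last fenced Python block out of a chat-style reply (the reply text below is hypothetical):

reply = ('Sure, here are both solutions:\n'
         '```python\n'
         'def first(x):\n'
         '    return x + 1\n'
         'def second(x):\n'
         '    return first(first(x))\n'
         '```\n'
         'Hope this helps.')
print(humanevalpro_postprocess_oc(reply))
# def first(x):
#     return x + 1
# def second(x):
#     return first(first(x))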
@@ -90,7 +90,7 @@ class HumanevalXEvaluator(BaseEvaluator):
         self.timeout = timeout
         super().__init__()

-    def score(self, predictions, references):
+    def score(self, predictions, references, test_set):
         predictions = [{
             'task_id':
             f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
@@ -3,6 +3,7 @@ import json
 import multiprocessing
 from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Sequence

 import numpy as np
 from tqdm import tqdm
@@ -174,7 +175,7 @@ def codegen_metrics(
     samples_list,
     generations_list,
     k_list=[1, 5, 10, 20, 40, 50, 75, 100, 125, 150, 200, 500, 1000],
-    num_process_evaluate=16,
+    num_process_evaluate=8,
     timeout=6,
     debug=False,
 ):
@@ -238,14 +239,20 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
                  release_version='release_v1',
                  extractor_version='v1',
                  start_date=None,
-                 end_date=None):
+                 end_date=None,
+                 num_repeats=1,
+                 k=1):
         super().__init__()
         self.num_process_evaluate = num_process_evaluate
         self.timeout = timeout
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         self.dataset = LCBCodeGenerationDataset.load(
             release_version=release_version,
             start_date=start_date,
-            end_date=end_date)['test']
+            end_date=end_date,
+            num_repeats=num_repeats)['test']
         self.extractor_version = extractor_version

     def score(self, predictions, references):
@@ -273,8 +280,11 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         filtered_references = []
         for idx, item in enumerate(references):
             if item in self.dataset['question_id']:
-                filtered_predictions.append(predictions[idx])
-                filtered_references.append(item)
+                if filtered_references and item == filtered_references[-1]:
+                    filtered_predictions[-1].extend(predictions[idx])
+                else:
+                    filtered_predictions.append(predictions[idx])
+                    filtered_references.append(item)

         filtered_references = [
             evaluation_samples[item] for item in filtered_references
@@ -291,7 +301,7 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         metrics, eval_results, final_metadata = codegen_metrics(
             filtered_references,
             filtered_predictions,
-            k_list=[1],
+            k_list=self.k,
             num_process_evaluate=self.num_process_evaluate,
             timeout=self.timeout,
         )
@@ -56,7 +56,8 @@ class LCBCodeGenerationDataset(BaseDataset):
              local_mode: bool = False,
              release_version: str = 'release_v1',
              start_date: str = None,
-             end_date: str = None):
+             end_date: str = None,
+             num_repeats: int = None):

         def transform(item):
             # Define the dataitem mapping logic
@@ -118,7 +119,13 @@ class LCBCodeGenerationDataset(BaseDataset):
         if end_date is not None:
             p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
             dataset = dataset.filter(lambda e: datetime.fromisoformat(e[
-                'contest_date']) <= p_end_date)  # noqa: E501
+                'contest_date']) <= p_end_date)
+
+        if num_repeats and num_repeats > 1:
+            indices = []
+            for idx in range(len(dataset)):
+                indices.extend([idx] * num_repeats)
+            dataset = dataset.select(indices)

         return DatasetDict({'test': dataset, 'train': dataset})
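The selection above interleaves the copies problem by problem rather than appending a second full pass; e.g., for a hypothetical three-problem split:

num_repeats = 2
indices = []
for idx in range(3):  # pretend len(dataset) == 3
    indices.extend([idx] * num_repeats)
print(indices)  # [0, 0, 1, 1, 2, 2]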
@@ -436,7 +436,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
     """Better use for pass k evaluation.

     Args:
-        k(Tuple[int]): Choices of Pass@k. Defaults to (1, 10, 100)
+        k(Union[int, Tuple[int, ...], List[int]]): Choices of Pass@k.
     """

     def __init__(self, k=(1, 10, 100)) -> None:
@@ -478,7 +478,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
         task_total = defaultdict(int)

         result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
-        with ProcessPoolExecutor() as executor:
+        with ProcessPoolExecutor(max_workers=8) as executor:
             futures = []
             for refer, preds in zip(references, predictions):
                 # suits for two case
@@ -494,7 +494,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
                 for pred in preds:
                     pred = self._process_answer(pred)
                     programs = self._process_test(test_case, pred)
-                    future = executor.submit(execution, programs, task_id, 10)
+                    future = executor.submit(execution, programs, task_id, 8)
                     futures.append(future)

         from tqdm import tqdm
opencompass/datasets/mbpp_pro.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import json
from typing import Dict, List

import numpy as np
from datasets import Dataset

from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset


class MBPPProDataset(BaseDataset):

    @staticmethod
    def load(path, num_repeats=1, local_mode=False):
        path = get_data_path(path, local_mode=local_mode)
        print(path)
        dataset = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                dataset.extend(
                    [json.loads(line.strip()) for _ in range(num_repeats)])
        return Dataset.from_list(dataset)


class MBPPProEvaluator(CodeEvaluator):

    def _process_completions(self, test_case: dict, completions: list) -> list:
        processed_completions = []
        for comp in completions:
            post_comp = self._extract_code(comp)
            processed_completions.append(post_comp)
        return processed_completions

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        test_set = test_set.to_pandas()
        # Use the first column as the unique identifier
        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
        num_repeats = int(len(test_set) / len(test_set_origin))

        # 1. Prepare data for all test cases
        all_test_cases = []
        for i in range(len(test_set_origin)):
            test_case = test_set_origin.iloc[i]
            completions = predictions[i * num_repeats:(i + 1) * num_repeats]

            # Process code completions
            processed_completions = self._process_completions(
                test_case, completions)

            sub_data_dict = {
                'name': int(test_case['id']),
                'language': self.language,
                'prompt': '',
                'tests': test_case['test_code'],
                'processed_completions': processed_completions,
                'completions': completions
            }

            all_test_cases.append(sub_data_dict)

        # 2. Send all test cases to the evaluation service
        success, outputs, error_message = self._evaluate(all_test_cases)
        if not success:
            return {'error': error_message}

        # 3. Process the returned results
        details = []
        total, correct = [], []
        for output in outputs:
            passed = [m['status'] == 'OK' for m in output['meta_data']]
            total.append(len(passed))
            correct.append(sum(passed))
            details.append(output)
        total = np.array(total)
        correct = np.array(correct)

        pass_at_k = {
            f'pass@{k}':
            self.estimate_pass_at_k(total, correct, k).mean() * 100
            for k in self.k if (total >= k).all()
        }

        return {
            **pass_at_k,
            'details': details,
        }
@@ -1,12 +1,14 @@
 # flake8: noqa: E501

 import difflib
+import itertools
 import os
 import re
 import tempfile
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

+import numpy as np
 from datasets import Dataset
 from gradio_client import Client

@@ -24,19 +26,24 @@ class CodeEvaluator(BaseEvaluator):
     """

     def __init__(self,
-                 language: str,
+                 language: str = 'py',
                  ip_address: str = 'localhost',
+                 k: Union[int, Tuple[int, ...], List[int]] = 1,
                  retry: int = 3) -> None:
         """Initialize the CodeEvaluator.

         Args:
             language (str): Programming language of the code to evaluate.
             ip_address (str, optional): IP address of the evaluation service. Defaults to 'localhost'.
+            k (Union[int, Tuple[int, ...], List[int]], optional): The k of pass@k for evaluating the code. Defaults to 1.
             retry (int, optional): Number of retry attempts for failed connections. Defaults to 3.
         """
         self.language = language
         self.retry = retry
         self.client = Client(ip_address)
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         super().__init__()

     def _extract_code(self, text: str) -> str:
@@ -195,6 +202,31 @@ class CodeEvaluator(BaseEvaluator):

         return True, output, None

+    def estimate_pass_at_k(self, num_samples: Union[int, List[int],
+                                                    np.ndarray],
+                           num_correct: Union[List[int], np.ndarray],
+                           k: int) -> np.ndarray:
+        """Estimates pass@k of each problem and returns them in an array."""
+
+        def estimator(n: int, c: int, k: int) -> float:
+            """Calculates 1 - comb(n - c, k) / comb(n, k)."""
+            if n - c < k:
+                return 1.0
+            return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+        if isinstance(num_samples, int):
+            num_samples_it = itertools.repeat(num_samples, len(num_correct))
+        else:
+            assert len(num_samples) == len(num_correct)
+            num_samples_it = iter(num_samples)
+
+        return np.array([
+            estimator(int(n), int(c), k)
+            for n, c in zip(num_samples_it, num_correct)
+        ])
+
     def score(self, predictions: List, references: List,
               test_set: Dataset) -> Dict:
         """Score code generation predictions against references.
@@ -233,7 +265,7 @@ class CodeEvaluator(BaseEvaluator):
             processed_completions = self._process_completions(
                 test_case, completions)

-            result_dict = {
+            sub_data_dict = {
                 'name': test_case['name'],
                 'language': test_case['language'],
                 'prompt': test_case['prompt'],
@@ -242,7 +274,7 @@ class CodeEvaluator(BaseEvaluator):
                 'completions': completions
             }

-            all_test_cases.append(result_dict)
+            all_test_cases.append(sub_data_dict)

         # 2. Send all test cases to the evaluation service
         success, outputs, error_message = self._evaluate(all_test_cases)
@@ -251,17 +283,22 @@ class CodeEvaluator(BaseEvaluator):

         # 3. Process the returned results
         details = []
-        correct = 0
+        total, correct = [], []
         for output in outputs:
-            if output.get('status') == 'OK':
-                output['correct'] = True
-                correct += 1
-            else:
-                output['correct'] = False
-
+            passed = [m['status'] == 'OK' for m in output['meta_data']]
+            total.append(len(passed))
+            correct.append(sum(passed))
             details.append(output)
+        total = np.array(total)
+        correct = np.array(correct)

+        pass_at_k = {
+            f'pass@{k}':
+            self.estimate_pass_at_k(total, correct, k).mean() * 100
+            for k in self.k if (total >= k).all()
+        }
         return {
-            f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
-            'details': details
+            **pass_at_k,
+            'details': details,
         }
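A quick numeric check of the estimator and aggregation above, using hypothetical counts (4 problems, 5 completions each):

import numpy as np

total = np.array([5, 5, 5, 5])    # completions generated per problem
correct = np.array([0, 2, 4, 5])  # completions that passed
# per-problem pass@1 is c/n -> [0.0, 0.4, 0.8, 1.0], so
# evaluator.estimate_pass_at_k(total, correct, 1).mean() * 100 -> 55.0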
@@ -420,6 +420,16 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/OlympiadBench",
     },
+    "opencompass/humaneval_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/humaneval_pro/humaneval_pro.json",
+    },
+    "opencompass/mbpp_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/mbpp_pro/mbpp_pro.json",
+    },
 }

 DATASETS_URL = {
@@ -746,5 +756,13 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
         "md5": "270f399f4142b74f47ecff116cc3b21d"
-    }
+    },
+    "humaneval_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
+        "md5": "4c6fe556e84e905e4f0902d699e46de5",
+    },
+    "mbpp_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
+        "md5": "eac330b8a0a8687f006265c9383503ce",
+    },
 }
@@ -51,7 +51,8 @@ class JSONToolkit:
             raise

     @staticmethod
-    def read_jsonl(file_path: Union[str, Path]) -> List[Dict[str, Any]]:
+    def read_jsonl(file_path: Union[str, Path],
+                   num_repeats: int = 1) -> List[Dict[str, Any]]:
         """Read a JSONL file and return its contents as a list of dictionaries.

         Args:
@@ -73,7 +74,9 @@ class JSONToolkit:
                 if not line:  # Skip empty lines
                     continue
                 try:
-                    results.append(json.loads(line))
+                    # results.append(json.loads(line))
+                    results.extend(
+                        [json.loads(line) for _ in range(num_repeats)])
                 except json.JSONDecodeError as e:
                     logger.error(
                         f'Invalid JSON on line {line_num}: {str(e)}')
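So with num_repeats=3 every record comes back three times in a row; for a hypothetical data.jsonl holding the two records {"id": 0} and {"id": 1}:

items = JSONToolkit.read_jsonl('data.jsonl', num_repeats=3)
# -> [{'id': 0}, {'id': 0}, {'id': 0}, {'id': 1}, {'id': 1}, {'id': 1}]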