Dongsheng Zhu 2025-04-12 14:15:23 +08:00 committed by GitHub
commit a3fa2fb105
29 changed files with 1039 additions and 75 deletions

examples/eval_codebench.py
@@ -0,0 +1,153 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
with read_base():
    # Datasets Part
    # bigcodebench
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
        bigcodebench_full_instruct_datasets
    )
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
        bigcodebench_hard_instruct_datasets
    )
    # livecodebench code generation lite v5
    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
        LCB_datasets
    )
    # humaneval
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
        humaneval_datasets
    )
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    from opencompass.configs.datasets.humanevalx.humanevalx_gen_627de5 import (
        humanevalx_datasets
    )
    # mbpp
    from opencompass.configs.datasets.mbpp.mbpp_gen import (
        mbpp_datasets
    )
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
        mbpppro_datasets
    )
    # multipl-e
    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
        multiple_datasets
    )
    # ds1000
    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
        ds1000_datasets
    )
    # Models Part
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
        models as lmdeploy_internlm3_8b_instruct_model,
    )
    # Summary Groups
    from opencompass.configs.summarizers.groups.ds1000 import (
        ds1000_summary_groups,
    )
    from opencompass.configs.summarizers.groups.multipl_e import (
        multiple_summary_groups,
    )
    from opencompass.configs.summarizers.groups.humanevalx import (
        humanevalx_summary_groups,
    )

# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    model['max_seq_len'] = 16384
    model['max_out_len'] = 8192

# datasets config
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

for item in humanevalx_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'
    ] = 'codeeval.opencompass.org.cn/humanevalx'
    item['eval_cfg']['evaluator']['port'] = ''
for item in ds1000_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'
    ] = 'codeeval.opencompass.org.cn/ds1000'
    item['eval_cfg']['evaluator']['port'] = ''

for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192

# summary
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.append(
    {'name': 'humanevalx',
     'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
)
summarizer = dict(
dataset_abbrs = [
['bigcodebench_hard_instruct', 'pass@1'],
['bigcodebench_full_instruct', 'pass@1'],
['lcb_code_generation', 'pass@1'],
['openai_humaneval', 'humaneval_pass@1'],
['mbpp', 'score'],
['humaneval_pro', 'pass@1'],
['mbpp_pro', 'pass@1'],
['multiple', 'naive_average'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
'',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-java',
'humanevalx-js',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
'ds1000_Scipy',
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
'',
'humaneval-multiple-cpp',
'humaneval-multiple-cs',
'humaneval-multiple-go',
'humaneval-multiple-java',
'humaneval-multiple-rb',
'humaneval-multiple-js',
'humaneval-multiple-php',
'humaneval-multiple-r',
'humaneval-multiple-rs',
'humaneval-multiple-sh',
'',
'mbpp-multiple-cpp',
'mbpp-multiple-cs',
'mbpp-multiple-go',
'mbpp-multiple-java',
'mbpp-multiple-rb',
'mbpp-multiple-js',
'mbpp-multiple-php',
'mbpp-multiple-r',
'mbpp-multiple-rs',
'mbpp-multiple-sh'
],
summary_groups=summary_groups,
)
work_dir = 'outputs/code'

@@ -0,0 +1,161 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
with read_base():
    # Datasets Part
    # bigcodebench
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
        bigcodebench_full_instruct_datasets
    )
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
        bigcodebench_hard_instruct_datasets
    )
    # livecodebench code generation lite v5
    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
        LCB_datasets
    )
    # humaneval
    from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import (
        humaneval_datasets
    )
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    # mbpp
    from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import (
        mbpp_datasets
    )
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
        mbpppro_datasets
    )
    # multipl-e
    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
        multiple_datasets
    )
    # Models Part
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
        models as lmdeploy_internlm3_8b_instruct_model,
    )

# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    model['max_seq_len'] = 16384
    model['max_out_len'] = 8192

# datasets config
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

num_repeats = 5
k = (1, 3, 5)
for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
    # pass@k config: each problem is sampled num_repeats (n=5) times and
    # scored at k = 1, 3 and 5 with the unbiased pass@k estimator.
    if not any(exclude in dataset['abbr'] for exclude in ('mbpp', 'humaneval')):
        dataset['eval_cfg']['evaluator']['num_repeats'] = num_repeats
        dataset['eval_cfg']['evaluator']['k'] = k
        dataset['num_repeats'] = num_repeats
        # dataset['abbr'] += f'_passk'
# summary
summarizer = dict(
dataset_abbrs = [
'pass@1',
['bigcodebench_full_instruct_passk', 'pass@1'],
['bigcodebench_hard_instruct_passk', 'pass@1'],
['lcb_code_generation_passk', 'pass@1'],
['openai_humaneval_passk_passk', 'humaneval_pass@1'],
['humaneval_pro_passk', 'pass@1'],
['mbpp_passk_passk', 'pass@1'],
['mbpp_pro_passk', 'pass@1'],
['humaneval-multiple-cpp_passk', 'pass@1'],
['humaneval-multiple-cs_passk', 'pass@1'],
['humaneval-multiple-go_passk', 'pass@1'],
['humaneval-multiple-java_passk', 'pass@1'],
['humaneval-multiple-rb_passk', 'pass@1'],
['humaneval-multiple-js_passk', 'pass@1'],
['humaneval-multiple-php_passk', 'pass@1'],
['humaneval-multiple-r_passk', 'pass@1'],
['humaneval-multiple-rs_passk', 'pass@1'],
['humaneval-multiple-sh_passk', 'pass@1'],
['mbpp-multiple-cpp_passk', 'pass@1'],
['mbpp-multiple-cs_passk', 'pass@1'],
['mbpp-multiple-go_passk', 'pass@1'],
['mbpp-multiple-java_passk', 'pass@1'],
['mbpp-multiple-rb_passk', 'pass@1'],
['mbpp-multiple-js_passk', 'pass@1'],
['mbpp-multiple-php_passk', 'pass@1'],
['mbpp-multiple-r_passk', 'pass@1'],
['mbpp-multiple-rs_passk', 'pass@1'],
['mbpp-multiple-sh_passk', 'pass@1'],
'',
'pass@3',
['bigcodebench_full_instruct_passk', 'pass@3'],
['bigcodebench_hard_instruct_passk', 'pass@3'],
['lcb_code_generation_passk', 'pass@3'],
['openai_humaneval_passk_passk', 'humaneval_pass@3'],
['humaneval_pro_passk', 'pass@3'],
['mbpp_passk_passk', 'pass@3'],
['mbpp_pro_passk', 'pass@3'],
['humaneval-multiple-cpp_passk', 'pass@3'],
['humaneval-multiple-cs_passk', 'pass@3'],
['humaneval-multiple-go_passk', 'pass@3'],
['humaneval-multiple-java_passk', 'pass@3'],
['humaneval-multiple-rb_passk', 'pass@3'],
['humaneval-multiple-js_passk', 'pass@3'],
['humaneval-multiple-php_passk', 'pass@3'],
['humaneval-multiple-r_passk', 'pass@3'],
['humaneval-multiple-rs_passk', 'pass@3'],
['humaneval-multiple-sh_passk', 'pass@3'],
['mbpp-multiple-cpp_passk', 'pass@3'],
['mbpp-multiple-cs_passk', 'pass@3'],
['mbpp-multiple-go_passk', 'pass@3'],
['mbpp-multiple-java_passk', 'pass@3'],
['mbpp-multiple-rb_passk', 'pass@3'],
['mbpp-multiple-js_passk', 'pass@3'],
['mbpp-multiple-php_passk', 'pass@3'],
['mbpp-multiple-r_passk', 'pass@3'],
['mbpp-multiple-rs_passk', 'pass@3'],
['mbpp-multiple-sh_passk', 'pass@3'],
'',
'pass@5',
['bigcodebench_full_instruct_passk', 'pass@5'],
['bigcodebench_hard_instruct_passk', 'pass@5'],
['lcb_code_generation_passk', 'pass@5'],
['openai_humaneval_passk_passk', 'humaneval_pass@5'],
['humaneval_pro_passk', 'pass@5'],
['mbpp_passk_passk', 'pass@5'],
['mbpp_pro_passk', 'pass@5'],
['humaneval-multiple-cpp_passk', 'pass@5'],
['humaneval-multiple-cs_passk', 'pass@5'],
['humaneval-multiple-go_passk', 'pass@5'],
['humaneval-multiple-java_passk', 'pass@5'],
['humaneval-multiple-rb_passk', 'pass@5'],
['humaneval-multiple-js_passk', 'pass@5'],
['humaneval-multiple-php_passk', 'pass@5'],
['humaneval-multiple-r_passk', 'pass@5'],
['humaneval-multiple-rs_passk', 'pass@5'],
['humaneval-multiple-sh_passk', 'pass@5'],
['mbpp-multiple-cpp_passk', 'pass@5'],
['mbpp-multiple-cs_passk', 'pass@5'],
['mbpp-multiple-go_passk', 'pass@5'],
['mbpp-multiple-java_passk', 'pass@5'],
['mbpp-multiple-rb_passk', 'pass@5'],
['mbpp-multiple-js_passk', 'pass@5'],
['mbpp-multiple-php_passk', 'pass@5'],
['mbpp-multiple-r_passk', 'pass@5'],
['mbpp-multiple-rs_passk', 'pass@5'],
['mbpp-multiple-sh_passk', 'pass@5'],
],
)
work_dir = 'outputs/code_passk'

@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_full_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api=
# 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='full',
num_repeats=1,
k=1,
),
pred_role='BOT',
)
bigcodebench_full_instruct_datasets = [
dict(abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2',
num_repeats=1,)
]

@@ -24,10 +24,12 @@ bigcodebench_hard_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
-        remote_execute_api=
-        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
+        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api=
+        # 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
+        num_repeats=1,
+        k=1,
     ),
     pred_role='BOT',
 )
@@ -42,5 +44,6 @@ bigcodebench_hard_instruct_datasets = [
         eval_cfg=bigcodebench_hard_eval_cfg,
         release_version='v0.1.2',
         dataset_version='hard',
+        num_repeats=1,
     )
 ]

@@ -19,9 +19,9 @@ humaneval_infer_cfg = dict(
     inferencer=dict(type=GenInferencer, max_out_len=512))

 humaneval_eval_cfg = dict(
-    evaluator=dict(type=HumanEvalEvaluator),
+    evaluator=dict(type=HumanEvalEvaluator,
+                   k=1),
     pred_role='BOT',
-    k=[1, 10, 100],  # the parameter only for humaneval
     pred_postprocessor=dict(type=humaneval_postprocess_v2),
 )
@@ -32,5 +32,6 @@ humaneval_datasets = [
         path='opencompass/humaneval',
         reader_cfg=humaneval_reader_cfg,
         infer_cfg=humaneval_infer_cfg,
-        eval_cfg=humaneval_eval_cfg)
+        eval_cfg=humaneval_eval_cfg,
+        num_repeats=1)
 ]

@@ -0,0 +1,17 @@
# HumanEval pro
## OC results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 65 |
| qwen2.5-14b-instruct-hf | 67 |
| deepseek-v2-lite-chat-hf | 35 |
## CodeEval-pro results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 65 |
| qwen2.5-14b-instruct-hf | 65 |
| deepseek-v2-lite-chat-hf | 28 |
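## Usage

A minimal config sketch for pulling the dataset into an OpenCompass run; the dataset import matches `humaneval_pro_gen.py` added in this commit, while the model import is only an example and can be swapped for any model config:

```python
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
        models as lmdeploy_internlm3_8b_instruct_model
    )

datasets = humanevalpro_datasets
models = lmdeploy_internlm3_8b_instruct_model
```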

@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
@@ Response
Please put the two solutions to the above problems in one Python code block.
"""
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
humanevalpro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
humanevalpro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalpro_eval_cfg = dict(
evaluator=dict(type=HumanevalProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space',
k=1)
)
humanevalpro_datasets = [
dict(
abbr='humaneval_pro',
type=HumanevalevalProDataset,
path='opencompass/humaneval_pro',
num_repeats=1,
reader_cfg=humanevalpro_reader_cfg,
infer_cfg=humanevalpro_infer_cfg,
eval_cfg=humanevalpro_eval_cfg,)
]
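For reference, `{raw_problem}` and `{new_problem}` in PROMPT_WRAPPER are filled from the two reader columns, so a single prompt can be previewed as below; this is a sketch only, and the problem strings are made-up placeholders rather than real dataset records:

# Sketch: preview how PROMPT_WRAPPER is filled from the reader columns.
# The values below are hypothetical placeholders.
example_item = {
    'raw_problem': 'def add(a, b):\n    return a + b\n',
    'new_problem': 'def add_three(a, b, c):\n    # should call add\n',
}
print(PROMPT_WRAPPER.format(**example_item))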

@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
humanevalx_reader_cfg = dict(
input_columns=['prompt'], output_column='declaration', train_split='test')
humanevalx_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalx_eval_cfg_dict = {
lang : dict(
evaluator=dict(
type=HumanevalXEvaluator,
language=lang,
ip_address=
'localhost', # replace to your code_eval_server ip_address, port
port=5001), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
pred_role='BOT')
for lang in ['python', 'cpp', 'java', 'js'] # do not support rust & go now
}
# Please download the needed `xx.jsonl.gz` from
# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
# and move them into `data/humanevalx/` folder
humanevalx_datasets = [
dict(
type=HumanevalXDataset,
abbr=f'humanevalx-{lang}',
language=lang,
path='./data/humanevalx',
reader_cfg=humanevalx_reader_cfg,
infer_cfg=humanevalx_infer_cfg,
eval_cfg=humanevalx_eval_cfg_dict[lang])
for lang in ['python', 'cpp', 'java', 'js']
]

@@ -33,9 +33,11 @@ lcb_code_generation_eval_cfg = dict(
     evaluator=dict(type=LCBCodeGenerationEvaluator,
                    num_process_evaluate=4,
                    timeout=6,
-                   release_version='release_v5',
+                   release_version='v5',
                    start_date='2024-08-01',
-                   end_date='2025-02-01'),
+                   end_date='2025-02-01',
+                   num_repeats=1,
+                   k=1,),
     pred_role='BOT',
 )
@@ -46,7 +48,8 @@ LCBCodeGeneration_dataset = dict(
     reader_cfg=lcb_code_generation_reader_cfg,
     infer_cfg=lcb_code_generation_infer_cfg,
     eval_cfg=lcb_code_generation_eval_cfg,
-    release_version='release_v5',
+    release_version='v5',
+    num_repeats=1,
 )

 # Code Execution Dataset
@@ -127,6 +130,6 @@ LCBTestOutput_dataset = dict(
 LCB_datasets = [
     LCBCodeGeneration_dataset,
-    LCBCodeExecution_dataset,
-    LCBTestOutput_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
 ]

@@ -28,7 +28,9 @@ mbpp_infer_cfg = dict(
     inferencer=dict(type=GenInferencer, max_out_len=512),
 )

-mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT')
+mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator,
+                                    k=1),
+                     pred_role='BOT')

 mbpp_datasets = [
     dict(
@@ -38,5 +40,6 @@ mbpp_datasets = [
         reader_cfg=mbpp_reader_cfg,
         infer_cfg=mbpp_infer_cfg,
         eval_cfg=mbpp_eval_cfg,
+        num_repeats=1,
     )
 ]

@@ -0,0 +1,17 @@
# MBPP pro
## OC results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 66 |
| qwen2.5-14b-instruct-hf | 64 |
| deepseek-v2-lite-chat-hf | 36 |
## CodeEval-pro results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 65 |
| qwen2.5-14b-instruct-hf | 65 |
| deepseek-v2-lite-chat-hf | 39 |
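## Data format

A sketch of the fields each record is expected to carry, inferred from the reader columns in `mbpp_pro_gen.py` and the `id`/`test_code` usage in `MBPPProDataset`/`MBPPProEvaluator`; the concrete values below are placeholders:

```python
{
    "id": 0,
    "raw_problem": "def is_even(n): ...",
    "new_problem": "def count_even(nums): ...  # must call is_even",
    "test_code": "assert count_even([1, 2, 4]) == 2"
}
```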

@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
@@ Response
Please put the two solutions to the above problems in one Python code block.
"""
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
mbpppro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
mbpppro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
mbpppro_eval_cfg = dict(
evaluator=dict(type=MBPPProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space',
k=1),
)
mbpppro_datasets = [
dict(
abbr='mbpp_pro',
type=MBPPProDataset,
path='opencompass/mbpp_pro',
num_repeats=1,
reader_cfg=mbpppro_reader_cfg,
infer_cfg=mbpppro_infer_cfg,
eval_cfg=mbpppro_eval_cfg)
]

@@ -21,6 +21,7 @@ multiple_eval_cfg = {
     evaluator=dict(
         type=MultiplEEvaluator,
         language=lang,
+        k = 1,
         ip_address='https://opencompass-multiple-evaluator.hf.space',
     ),
     pred_role='BOT',

@@ -0,0 +1,6 @@
multiple_summary_groups = []
humaneval_multiple = ['humaneval-multiple-cpp', 'humaneval-multiple-cs', 'humaneval-multiple-go', 'humaneval-multiple-java', 'humaneval-multiple-rb', 'humaneval-multiple-js', 'humaneval-multiple-php', 'humaneval-multiple-r', 'humaneval-multiple-rs', 'humaneval-multiple-sh']
mbpp_multiple = ['mbpp-multiple-cpp', 'mbpp-multiple-cs', 'mbpp-multiple-go', 'mbpp-multiple-java', 'mbpp-multiple-rb', 'mbpp-multiple-js', 'mbpp-multiple-php', 'mbpp-multiple-r', 'mbpp-multiple-rs', 'mbpp-multiple-sh']
multiple_summary_groups.append({'name': 'multiple', 'subsets': humaneval_multiple})
multiple_summary_groups.append({'name':'multiple','subsets': mbpp_multiple})

@@ -62,6 +62,7 @@ from .hle import *  # noqa: F401, F403
 from .huggingface import *  # noqa: F401, F403
 from .humaneval import *  # noqa: F401, F403
 from .humaneval_multi import *  # noqa: F401, F403
+from .humaneval_pro import *  # noqa: F401, F403
 from .humanevalx import *  # noqa: F401, F403
 from .hungarian_math import *  # noqa: F401, F403
 from .IFEval.ifeval import IFEvalDataset, IFEvaluator  # noqa: F401, F403
@@ -91,6 +92,7 @@ from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
 from .mathbench import *  # noqa: F401, F403
 from .mbpp import *  # noqa: F401, F403
+from .mbpp_pro import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .MedXpertQA import *  # noqa: F401, F403
 from .mgsm import *  # noqa: F401, F403

@@ -4,6 +4,7 @@
 import os
 import time
 from concurrent.futures._base import CancelledError
+from typing import List, Sequence, Tuple, Union

 import httpx
 from datasets import Dataset, DatasetDict
@@ -24,7 +25,8 @@ class BigCodeBenchDataset(BaseDataset):
     def load(path: str = 'opencompass/bigcodebench',
              local_mode: bool = False,
              release_version: str = 'v0.1.2',
-             dataset_version: str = 'full'):
+             dataset_version: str = 'full',
+             num_repeats: int = 1):
         """
         Args:
             path (str): The path to the dataset.
@@ -33,6 +35,7 @@ class BigCodeBenchDataset(BaseDataset):
             release_version (str): The release version of the dataset.
             dataset_version (str): The data version of the dataset.
                 only support ['full', 'hard']
+            num_repeats (int): Number of times to repeat dataset for pass@k.
         """
         assert dataset_version in ['full', 'hard'], \
             'dataset_version should be one of ["full", "hard"], '
@@ -45,11 +48,13 @@ class BigCodeBenchDataset(BaseDataset):
         # 'entry_point', 'doc_struct', 'libs'
         if dataset_version == 'full':
             items = JSONToolkit.read_jsonl(
-                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'))
+                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'),
+                num_repeats)
         else:
             items = JSONToolkit.read_jsonl(
                 os.path.join(path,
-                             f'BigCodeBench-Hard-{release_version}.jsonl'))
+                             f'BigCodeBench-Hard-{release_version}.jsonl'),
+                num_repeats)

         dataset['train'] = Dataset.from_list(items)
         dataset['test'] = Dataset.from_list(items)
@@ -61,10 +66,10 @@ class BigCodeBenchEvaluator(BaseEvaluator):
     """Evaluator for BigCodeBench.

     Args:
-        num_process_evaluate (int): number of processes to evaluate
         timeout (int): timeout for each evaluation
         release_version (str): release version of BigCodeBench
         eval_type (str): type of evaluation, either 'instruct' or 'completion'
+        k (str): pass@k for evaluation
     """

     def __init__(
@@ -75,7 +80,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
         dataset_version: str = 'full',
         local_mode: bool = False,
         path: str = 'opencompass/bigcodebench',
+        num_repeats=1,
         pass_k: str = '1,5,10',
+        k: Union[int, Tuple[int, ...], List[int]] = 1,
         parallel: int = -1,
         min_time_limit: float = 1,
         max_as_limit: int = 30 * 1024,
@@ -88,12 +95,17 @@ class BigCodeBenchEvaluator(BaseEvaluator):
             release_version=release_version,
             dataset_version=dataset_version,
             local_mode=local_mode,
-            path=path)['test']
+            path=path,
+            num_repeats=num_repeats)['test']
         self.eval_type = eval_type
+        if not isinstance(k, Sequence):
+            k = (k, )
+        k = ', '.join(map(str, k))
+        self.k = k

         self.remote_execute_api = remote_execute_api
         self.eval_kwargs = dict(subset=dataset_version,
-                                pass_k=pass_k,
+                                pass_k=self.k,
                                 parallel=parallel,
                                 min_time_limit=min_time_limit,
                                 max_as_limit=max_as_limit,
@@ -141,7 +153,7 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                 signal.alarm(0)
                 signal.signal(signal.SIGALRM, original_handler)

-            with timeout_handler(10):
+            with timeout_handler(300):
                 sanitized_prediction = extract_code_generation(
                     prediction, entrypoint=entrypoint)
@@ -188,7 +200,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
             while True:
                 try:
                     eval_client = Client(self.remote_execute_api,
-                                         httpx_kwargs=dict(proxies=proxies))
+                                         httpx_kwargs=dict(
+                                             proxies=proxies,
+                                             timeout=httpx.Timeout(100.0)))
                     results, pass_at_k = eval_client.predict(
                         split=self.eval_type,
                         samples=handle_file(submitted_contents_path),
@@ -196,22 +210,25 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                         **self.eval_kwargs)
                     break
                 except (httpx.ReadTimeout, CancelledError):
-                    logger.info('Read timeout error. Retrying in 4s...')
+                    logger.info('Read timeout error. Retrying in 10s...')
                     time.sleep(10)

-        if 'pass@1' in pass_at_k.keys():
-            pass_at_k['pass@1'] *= 100
-        dump_results = {'details': self._results_processor(results)}
-        dump_results.update(pass_at_k)
-
-        return dump_results
+        pass_at_k = {
+            k: v * 100 if isinstance(v, (int, float)) else v
+            for k, v in pass_at_k.items()
+        }
+
+        return {
+            **pass_at_k,
+            'details': self._results_processor(results),
+        }

     def _results_processor(self, results):
         details = []
         for key, value in results['eval'].items():
-            if value[0]['status'] == 'pass':
-                value[0]['correct'] = True
-            else:
-                value[0]['correct'] = False
-            details.append(value[0])
+            detail = {'correct': False, 'results_details': value}
+            for v in value:
+                if v['status'] == 'pass':
+                    detail['correct'] = True
+                    break
+            details.append(detail)
         return details

@@ -191,14 +191,19 @@ class CodeCustomDataset(BaseDataset):
         path = get_data_path(path, local_mode=local_mode)
         if file_name is not None:
             path = os.path.join(path, file_name)
+        files = os.listdir(path)
         data = []
-        if path.endswith('.jsonl'):
-            with open(path, 'r', encoding='utf-8') as f:
+        if any(f.endswith('.jsonl') for f in files):
+            target_file = next(f for f in files if f.endswith('.jsonl'))
+            target_path = os.path.join(path, target_file)
+            with open(target_path, 'r', encoding='utf-8') as f:
                 for line in f:
                     data.extend(
                         [json.loads(line.strip()) for _ in range(num_repeats)])
-        elif path.endswith('.csv'):
-            with open(path, 'r', encoding='utf-8-sig') as f:
+        elif any(f.endswith('.csv') for f in files):
+            target_file = next(f for f in files if f.endswith('.csv'))
+            target_path = os.path.join(path, target_file)
+            with open(target_path, 'r', encoding='utf-8-sig') as f:
                 reader = csv.reader(f)
                 header = next(reader)
                 for row in reader:

@@ -6,7 +6,7 @@ import os.path as osp
 import re
 import tempfile
 from os import environ
-from typing import List
+from typing import List, Sequence, Tuple, Union

 from datasets import Dataset
@@ -70,12 +70,16 @@ class HumanevalDataset(BaseDataset):
 class HumanEvalEvaluator(BaseEvaluator):
     """Evaluator for HumanEval or EvalPlus."""

-    def __init__(self, k: List[int] = [1, 10, 100]) -> None:
+    def __init__(self, k: Union[int, Tuple[int, ...], List[int]] = 1,
+                 num_repeats: int = 1) -> None:
         try:
             import human_eval
         except ImportError:
             raise ImportError(HUMANEVAL_IMPORT_ERROR)

+        self.n = num_repeats
+        if not isinstance(k, Sequence):
+            k = (k, )
         self.k = k
         super().__init__()
@@ -87,16 +91,24 @@ class HumanEvalEvaluator(BaseEvaluator):
         from human_eval.evaluation import evaluate_functional_correctness

         prompts = [item['prompt'] for item in test_set]
-        humaneval_preds = []
+        predictions_processed, references_processed = [], []
+        for pred, refer in zip(predictions, references):
+            if references_processed and refer == references_processed[-1]:
+                predictions_processed[-1].extend([pred])
+            else:
+                references_processed.append(refer)
+                predictions_processed.append([pred])
         # create json file in human_eval format
-        for preds, refer in zip(predictions, references):
+        humaneval_preds = []
+        for preds_p, refer_p in zip(predictions_processed, references_processed):
             # suits for two case
             # 1. use repeated dataset
             # 2. use `num_return_sequences` to generate multiple responses
-            if not isinstance(preds, list):
-                preds = [preds]
-            for pred in preds:
-                humaneval_preds.append({'task_id': refer, 'completion': pred})
+            if not isinstance(preds_p, list):
+                preds_p = [preds_p]
+            for pred_p in preds_p:
+                humaneval_preds.append({'task_id': refer_p, 'completion': pred_p})
         with tempfile.TemporaryDirectory() as tmp_dir:
             out_dir = osp.join(tmp_dir, 'human_eval.json')
             write_jsonl(out_dir, humaneval_preds)
@@ -183,13 +195,13 @@ def humaneval_postprocess_v2(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[0]
-    return text
+    return text.lstrip()

 def humaneval_postprocess_v3(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[-1]
-    return text
+    return text.lstrip()

 def humaneval_internal_v2_postprocess(text: str):
     if text.startswith(' ') and not text.startswith(' '):
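The change to `score` above first merges consecutive repeated samples of the same problem into one candidate list before writing the human_eval JSONL; a small standalone illustration of that grouping step, using toy values rather than real predictions:

# Toy illustration of the grouping added to HumanEvalEvaluator.score:
# consecutive predictions sharing a reference collapse into one list.
predictions = ['cand_a1', 'cand_a2', 'cand_b1', 'cand_b2']
references = ['HumanEval/0', 'HumanEval/0', 'HumanEval/1', 'HumanEval/1']

predictions_processed, references_processed = [], []
for pred, refer in zip(predictions, references):
    if references_processed and refer == references_processed[-1]:
        predictions_processed[-1].extend([pred])
    else:
        references_processed.append(refer)
        predictions_processed.append([pred])

# predictions_processed -> [['cand_a1', 'cand_a2'], ['cand_b1', 'cand_b2']]
# references_processed  -> ['HumanEval/0', 'HumanEval/1']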

@@ -0,0 +1,96 @@
import json
from typing import Dict, List
import numpy as np
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
class HumanevalevalProDataset(BaseDataset):
@staticmethod
def load(path, num_repeats=1, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
raw_data = json.load(f)
for data in raw_data:
dataset.extend([data for _ in range(num_repeats)])
return Dataset.from_list(dataset)
class HumanevalProEvaluator(CodeEvaluator):
def _process_completions(self, test_case: dict, completions: list) -> list:
processed_completions = []
for comp in completions:
post_comp = self._extract_code(comp)
processed_completions.append(post_comp)
return processed_completions
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
num_repeats = int(len(test_set) / len(test_set_origin))
# 1. Prepare data for all test cases
all_test_cases = []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completions = predictions[i * num_repeats:(i + 1) * num_repeats]
# Process code completions
processed_completions = self._process_completions(
test_case, completions)
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'prompt': '',
'tests': test_case['test_code'],
'processed_completions': processed_completions,
'completions': completions
}
all_test_cases.append(sub_data_dict)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
details = []
total, correct = [], []
for output in outputs:
passed = [m['status'] == 'OK' for m in output['meta_data']]
total.append(len(passed))
correct.append(sum(passed))
details.append(output)
total = np.array(total)
correct = np.array(correct)
pass_at_k = {
f'pass@{k}':
self.estimate_pass_at_k(total, correct, k).mean() * 100
for k in self.k if (total >= k).all()
}
return {
**pass_at_k,
'details': details,
}

@@ -0,0 +1,89 @@
import json
import os
import evaluate
from datasets import Dataset
from opencompass.openicl.icl_evaluator import HuggingfaceEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
os.environ['HF_ALLOW_CODE_EVAL'] = '1'
class HumanevalevalProDataset(BaseDataset):
@staticmethod
def load(path, num_repeats=1, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
raw_data = json.load(f)
for data in raw_data:
dataset.extend([data for _ in range(num_repeats)])
return Dataset.from_list(dataset)
class HumanevalProEvaluator(HuggingfaceEvaluator):
def _preprocess(self, predictions, references):
predictions = [[_] for _ in predictions]
return {
'predictions': predictions,
'references': references,
}
def _postprocess(self, scores):
scores = {f'humaneval_{k}': scores[k] * 100 for k in scores}
return scores
def score(self, predictions, references, test_set):
# predictions are LLM's output; references are the 'output_column' of 'humanevalpro_reader_cfg' # noqa: E501
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
# use codes pre-downloaded to opencompass repo, avoid downloading
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
local_path = os.path.join(parent_dir, 'openicl', 'icl_evaluator',
'hf_metrics', self.metric)
if os.path.exists(local_path):
metric = evaluate.load(local_path)
else:
metric = evaluate.load(self.metric)
scores, _ = metric.compute(**self._preprocess(predictions, references),
k=[1, 3, 5],
num_workers=4)
result = self._postprocess(scores)
return result
def humanevalpro_postprocess_official(text):
"""The official post-processing method for humaneval_pro, which is solely
applicable to the complete generation paradigm.
# noqa: E501 The chat template paradigm requires a different post-
processing method.
"""
text = text[:index if (index := text.find('```')) != -1 else len(text)]
return text
def humanevalpro_postprocess_oc(text):
"""For those generated based on the chat template paradigm, this method is
recommended.
# noqa: E501
"""
start = text.rfind('```python') + len('```python')
end = text.find('```', start)
code = text[start:end].strip()
return code

@@ -90,7 +90,7 @@ class HumanevalXEvaluator(BaseEvaluator):
         self.timeout = timeout
         super().__init__()

-    def score(self, predictions, references):
+    def score(self, predictions, references, test_set):
         predictions = [{
             'task_id':
             f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',

@@ -3,6 +3,7 @@ import json
 import multiprocessing
 from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Sequence

 import numpy as np
 from tqdm import tqdm
@@ -174,7 +175,7 @@ def codegen_metrics(
     samples_list,
     generations_list,
     k_list=[1, 5, 10, 20, 40, 50, 75, 100, 125, 150, 200, 500, 1000],
-    num_process_evaluate=16,
+    num_process_evaluate=8,
     timeout=6,
     debug=False,
 ):
@@ -238,14 +239,20 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
                  release_version='release_v1',
                  extractor_version='v1',
                  start_date=None,
-                 end_date=None):
+                 end_date=None,
+                 num_repeats=1,
+                 k=1):
         super().__init__()
         self.num_process_evaluate = num_process_evaluate
         self.timeout = timeout
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         self.dataset = LCBCodeGenerationDataset.load(
             release_version=release_version,
             start_date=start_date,
-            end_date=end_date)['test']
+            end_date=end_date,
+            num_repeats=num_repeats)['test']
         self.extractor_version = extractor_version

     def score(self, predictions, references):
@@ -273,6 +280,9 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         filtered_references = []
         for idx, item in enumerate(references):
             if item in self.dataset['question_id']:
+                if filtered_references and item == filtered_references[-1]:
+                    filtered_predictions[-1].extend(predictions[idx])
+                else:
                 filtered_predictions.append(predictions[idx])
                 filtered_references.append(item)
@@ -291,7 +301,7 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         metrics, eval_results, final_metadata = codegen_metrics(
             filtered_references,
             filtered_predictions,
-            k_list=[1],
+            k_list=self.k,
             num_process_evaluate=self.num_process_evaluate,
             timeout=self.timeout,
         )

@@ -56,7 +56,8 @@ class LCBCodeGenerationDataset(BaseDataset):
              local_mode: bool = False,
              release_version: str = 'release_v1',
              start_date: str = None,
-             end_date: str = None):
+             end_date: str = None,
+             num_repeats: int = None):

         def transform(item):
             # Define the dataitem mapping logic
@@ -118,7 +119,13 @@ class LCBCodeGenerationDataset(BaseDataset):
         if end_date is not None:
             p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
             dataset = dataset.filter(lambda e: datetime.fromisoformat(e[
-                'contest_date']) <= p_end_date)  # noqa: E501
+                'contest_date']) <= p_end_date)
+
+        if num_repeats and num_repeats > 1:
+            indices = []
+            for idx in range(len(dataset)):
+                indices.extend([idx] * num_repeats)
+            dataset = dataset.select(indices)

         return DatasetDict({'test': dataset, 'train': dataset})
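The `num_repeats` expansion above relies on `datasets.Dataset.select` accepting repeated indices; a toy sketch of the same behaviour outside OpenCompass, with made-up rows:

# Toy sketch: select() with repeated indices duplicates each row in place,
# which is how the loader produces n copies per problem for pass@k.
from datasets import Dataset

toy = Dataset.from_list([{'question_id': 'q1'}, {'question_id': 'q2'}])
num_repeats = 3
indices = []
for idx in range(len(toy)):
    indices.extend([idx] * num_repeats)
toy = toy.select(indices)
# len(toy) == 6; rows ordered q1, q1, q1, q2, q2, q2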

@@ -436,7 +436,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
     """Better use for pass k evaluation.

     Args:
-        k(Tuple[int]): Choices of Pass@k. Defaults to (1, 10, 100)
+        k(Union[int, Tuple[int, ...], List[int]]): Choices of Pass@k.
     """

     def __init__(self, k=(1, 10, 100)) -> None:
@@ -478,7 +478,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
         task_total = defaultdict(int)

         result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
-        with ProcessPoolExecutor() as executor:
+        with ProcessPoolExecutor(max_workers=8) as executor:
             futures = []
             for refer, preds in zip(references, predictions):
                 # suits for two case
@@ -494,7 +494,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
                 for pred in preds:
                     pred = self._process_answer(pred)
                     programs = self._process_test(test_case, pred)
-                    future = executor.submit(execution, programs, task_id, 10)
+                    future = executor.submit(execution, programs, task_id, 8)
                     futures.append(future)

         from tqdm import tqdm

@@ -0,0 +1,97 @@
import json
from typing import Dict, List
import numpy as np
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
class MBPPProDataset(BaseDataset):
@staticmethod
def load(path, num_repeats=1, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
print(path)
dataset = []
with open(path, encoding='utf-8') as f:
for line in f:
dataset.extend(
[json.loads(line.strip()) for _ in range(num_repeats)])
return Dataset.from_list(dataset)
class MBPPProEvaluator(CodeEvaluator):
def _process_completions(self, test_case: dict, completions: list) -> list:
processed_completions = []
for comp in completions:
post_comp = self._extract_code(comp)
processed_completions.append(post_comp)
return processed_completions
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
num_repeats = int(len(test_set) / len(test_set_origin))
# 1. Prepare data for all test cases
all_test_cases = []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completions = predictions[i * num_repeats:(i + 1) * num_repeats]
# Process code completions
processed_completions = self._process_completions(
test_case, completions)
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'prompt': '',
'tests': test_case['test_code'],
'processed_completions': processed_completions,
'completions': completions
}
all_test_cases.append(sub_data_dict)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
details = []
total, correct = [], []
for output in outputs:
passed = [m['status'] == 'OK' for m in output['meta_data']]
total.append(len(passed))
correct.append(sum(passed))
details.append(output)
total = np.array(total)
correct = np.array(correct)
pass_at_k = {
f'pass@{k}':
self.estimate_pass_at_k(total, correct, k).mean() * 100
for k in self.k if (total >= k).all()
}
return {
**pass_at_k,
'details': details,
}

@@ -1,12 +1,14 @@
 # flake8: noqa: E501

 import difflib
+import itertools
 import os
 import re
 import tempfile
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

+import numpy as np
 from datasets import Dataset
 from gradio_client import Client
@@ -24,19 +26,24 @@ class CodeEvaluator(BaseEvaluator):
     """

     def __init__(self,
-                 language: str,
+                 language: str = 'py',
                  ip_address: str = 'localhost',
+                 k: Union[int, Tuple[int, ...], List[int]] = 1,
                  retry: int = 3) -> None:
         """Initialize the CodeEvaluator.

         Args:
             language (str): Programming language of the code to evaluate.
             ip_address (str, optional): IP address of the evaluation service. Defaults to 'localhost'.
+            k: Union[int, Tuple[int,...], List[int,...]]: The number k of pass@k to evaluate the code. Defaults to 1.
             retry (int, optional): Number of retry attempts for failed connections. Defaults to 3.
         """
         self.language = language
         self.retry = retry
         self.client = Client(ip_address)
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         super().__init__()

     def _extract_code(self, text: str) -> str:
@@ -195,6 +202,31 @@ class CodeEvaluator(BaseEvaluator):
         return True, output, None

+    def estimate_pass_at_k(self, num_samples: Union[int, List[int],
+                                                    np.ndarray],
+                           num_correct: Union[List[int], np.ndarray],
+                           k: int) -> np.ndarray:
+        """Estimates pass@k of each problem and returns them in an array."""
+
+        def estimator(n: int, c: int, k: int) -> float:
+            """
+            Calculates 1 - comb(n - c, k) / comb(n, k).
+            """
+            if n - c < k:
+                return 1.0
+            return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+        if isinstance(num_samples, int):
+            num_samples_it = itertools.repeat(num_samples, len(num_correct))
+        else:
+            assert len(num_samples) == len(num_correct)
+            num_samples_it = iter(num_samples)
+
+        return np.array([
+            estimator(int(n), int(c), k)
+            for n, c in zip(num_samples_it, num_correct)
+        ])
+
     def score(self, predictions: List, references: List,
               test_set: Dataset) -> Dict:
         """Score code generation predictions against references.
@@ -233,7 +265,7 @@ class CodeEvaluator(BaseEvaluator):
             processed_completions = self._process_completions(
                 test_case, completions)

-            result_dict = {
+            sub_data_dict = {
                 'name': test_case['name'],
                 'language': test_case['language'],
                 'prompt': test_case['prompt'],
@@ -242,7 +274,7 @@ class CodeEvaluator(BaseEvaluator):
                 'completions': completions
             }

-            all_test_cases.append(result_dict)
+            all_test_cases.append(sub_data_dict)

         # 2. Send all test cases to the evaluation service
         success, outputs, error_message = self._evaluate(all_test_cases)
@@ -251,17 +283,22 @@ class CodeEvaluator(BaseEvaluator):
         # 3. Process the returned results
         details = []
-        correct = 0
+        total, correct = [], []
         for output in outputs:
-            if output.get('status') == 'OK':
-                output['correct'] = True
-                correct += 1
-            else:
-                output['correct'] = False
+            passed = [m['status'] == 'OK' for m in output['meta_data']]
+            total.append(len(passed))
+            correct.append(sum(passed))
             details.append(output)

+        total = np.array(total)
+        correct = np.array(correct)
+        pass_at_k = {
+            f'pass@{k}':
+            self.estimate_pass_at_k(total, correct, k).mean() * 100
+            for k in self.k if (total >= k).all()
+        }
         return {
-            f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
-            'details': details
+            **pass_at_k,
+            'details': details,
         }
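`estimate_pass_at_k` above is the standard unbiased estimator, pass@k = 1 - C(n-c, k)/C(n, k); a quick standalone check with toy numbers, not OpenCompass code:

# Standalone check of the unbiased pass@k estimator, using the same
# product form as estimate_pass_at_k above.
import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))


print(pass_at_k(5, 2, 1))  # 0.4  (2 of 5 samples correct)
print(pass_at_k(5, 2, 5))  # 1.0  (k equals the sample budget)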

@@ -420,6 +420,16 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/OlympiadBench",
     },
+    "opencompass/humaneval_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/humaneval_pro/humaneval_pro.json",
+    },
+    "opencompass/mbpp_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/mbpp_pro/mbpp_pro.json",
+    },
 }

 DATASETS_URL = {
@@ -746,5 +756,13 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
         "md5": "270f399f4142b74f47ecff116cc3b21d"
-    }
+    },
+    "humaneval_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
+        "md5": "4c6fe556e84e905e4f0902d699e46de5",
+    },
+    "mbpp_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
+        "md5": "eac330b8a0a8687f006265c9383503ce",
+    },
 }

@@ -51,7 +51,8 @@ class JSONToolkit:
             raise

     @staticmethod
-    def read_jsonl(file_path: Union[str, Path]) -> List[Dict[str, Any]]:
+    def read_jsonl(file_path: Union[str, Path],
+                   num_repeats: int = 1) -> List[Dict[str, Any]]:
         """Read a JSONL file and return its contents as a list of dictionaries.

         Args:
@@ -73,7 +74,9 @@ class JSONToolkit:
                 if not line:  # Skip empty lines
                     continue
                 try:
-                    results.append(json.loads(line))
+                    # results.append(json.loads(line))
+                    results.extend(
+                        [json.loads(line) for _ in range(num_repeats)])
                 except json.JSONDecodeError as e:
                     logger.error(
                         f'Invalid JSON on line {line_num}: {str(e)}')
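With the new argument, each parsed record is simply emitted `num_repeats` times; a toy sketch of the resulting behaviour using plain `json` rather than the `JSONToolkit` class itself, with hypothetical file contents:

# Toy sketch of the repeated-read behaviour: every line is parsed once and
# appended num_repeats times, preserving order.
import json

lines = ['{"id": 1}', '{"id": 2}']
num_repeats = 2
results = []
for line in lines:
    results.extend([json.loads(line) for _ in range(num_repeats)])
# results == [{'id': 1}, {'id': 1}, {'id': 2}, {'id': 2}]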