Dongsheng Zhu 2025-04-12 14:15:23 +08:00 committed by GitHub
commit a3fa2fb105
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 1039 additions and 75 deletions

examples/eval_codebench.py Normal file
View File

@ -0,0 +1,153 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
with read_base():
# Datasets Part
# bigcodebench
from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
bigcodebench_full_instruct_datasets
)
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
bigcodebench_hard_instruct_datasets
)
# livecodebench code generation lite v5
from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
LCB_datasets
)
# humaneval
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
humaneval_datasets
)
from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
humanevalpro_datasets
)
from opencompass.configs.datasets.humanevalx.humanevalx_gen_627de5 import (
humanevalx_datasets
)
# mbpp
from opencompass.configs.datasets.mbpp.mbpp_gen import (
mbpp_datasets
)
from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
mbpppro_datasets
)
# multipl-e
from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
multiple_datasets
)
# ds1000
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
ds1000_datasets
)
# Models Part
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
models as lmdeploy_internlm3_8b_instruct_model,
)
# Summary Groups
from opencompass.configs.summarizers.groups.ds1000 import (
ds1000_summary_groups,
)
from opencompass.configs.summarizers.groups.multipl_e import (
multiple_summary_groups,
)
from opencompass.configs.summarizers.groups.humanevalx import (
humanevalx_summary_groups,
)
# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
model['max_seq_len'] = 16384
model['max_out_len'] = 8192
# datasets config
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
for item in humanevalx_datasets:
item['eval_cfg']['evaluator'][
'ip_address'
] = 'codeeval.opencompass.org.cn/humanevalx'
item['eval_cfg']['evaluator']['port'] = ''
for item in ds1000_datasets:
item['eval_cfg']['evaluator'][
'ip_address'
] = 'codeeval.opencompass.org.cn/ds1000'
item['eval_cfg']['evaluator']['port'] = ''
for dataset in datasets:
dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
# summary
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.append(
{'name': 'humanevalx',
'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
)
summarizer = dict(
dataset_abbrs = [
['bigcodebench_hard_instruct', 'pass@1'],
['bigcodebench_full_instruct', 'pass@1'],
['lcb_code_generation', 'pass@1'],
['openai_humaneval', 'humaneval_pass@1'],
['mbpp', 'score'],
['humaneval_pro', 'pass@1'],
['mbpp_pro', 'pass@1'],
['multiple', 'naive_average'],
['humanevalx', 'naive_average'],
['ds1000', 'naive_average'],
'',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-java',
'humanevalx-js',
'',
'ds1000_Pandas',
'ds1000_Numpy',
'ds1000_Tensorflow',
'ds1000_Scipy',
'ds1000_Sklearn',
'ds1000_Pytorch',
'ds1000_Matplotlib',
'',
'humaneval-multiple-cpp',
'humaneval-multiple-cs',
'humaneval-multiple-go',
'humaneval-multiple-java',
'humaneval-multiple-rb',
'humaneval-multiple-js',
'humaneval-multiple-php',
'humaneval-multiple-r',
'humaneval-multiple-rs',
'humaneval-multiple-sh',
'',
'mbpp-multiple-cpp',
'mbpp-multiple-cs',
'mbpp-multiple-go',
'mbpp-multiple-java',
'mbpp-multiple-rb',
'mbpp-multiple-js',
'mbpp-multiple-php',
'mbpp-multiple-r',
'mbpp-multiple-rs',
'mbpp-multiple-sh'
],
summary_groups=summary_groups,
)
work_dir = 'outputs/code'
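The config above relies on a locals() scan to gather everything imported under read_base(). A minimal standalone sketch of that aggregation pattern (the dict contents below are toy placeholders; only the variable names match the imports above):

```python
# Any module-level variable whose name ends in `_model` is swept into `models`;
# the same trick with the `_datasets` and `_summary_groups` suffixes builds the
# dataset and summary lists.
lmdeploy_qwen2_5_7b_instruct_model = [dict(abbr='qwen2.5-7b-instruct-turbomind')]
lmdeploy_internlm3_8b_instruct_model = [dict(abbr='internlm3-8b-instruct-turbomind')]

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
print([m['abbr'] for m in models])
# -> ['qwen2.5-7b-instruct-turbomind', 'internlm3-8b-instruct-turbomind']
```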

View File

@ -0,0 +1,161 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
with read_base():
# Datasets Part
# bigcodebench
from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
bigcodebench_full_instruct_datasets
)
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
bigcodebench_hard_instruct_datasets
)
# livecodebench code generation lite v5
from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
LCB_datasets
)
# humaneval
from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import (
humaneval_datasets
)
from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
humanevalpro_datasets
)
# mbpp
from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import (
mbpp_datasets
)
from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
mbpppro_datasets
)
# multipl-e
from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
multiple_datasets
)
# Models Part
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
models as lmdeploy_internlm3_8b_instruct_model,
)
# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
model['max_seq_len'] = 16384
model['max_out_len'] = 8192
# datasets config
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
num_repeats = 5
k = (1, 3, 5)
for dataset in datasets:
dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
# pass@k config: the current setting evaluates pass@{1, 3, 5} with num_repeats = 5 samples per problem.
if not any(exclude in dataset['abbr'] for exclude in ('mbpp', 'humaneval')):
dataset['eval_cfg']['evaluator']['num_repeats'] = num_repeats
dataset['eval_cfg']['evaluator']['k'] = k
dataset['num_repeats'] = num_repeats
# dataset['abbr'] += f'_passk'
# summary
summarizer = dict(
dataset_abbrs = [
'pass@1',
['bigcodebench_full_instruct_passk', 'pass@1'],
['bigcodebench_hard_instruct_passk', 'pass@1'],
['lcb_code_generation_passk', 'pass@1'],
['openai_humaneval_passk_passk', 'humaneval_pass@1'],
['humaneval_pro_passk', 'pass@1'],
['mbpp_passk_passk', 'pass@1'],
['mbpp_pro_passk', 'pass@1'],
['humaneval-multiple-cpp_passk', 'pass@1'],
['humaneval-multiple-cs_passk', 'pass@1'],
['humaneval-multiple-go_passk', 'pass@1'],
['humaneval-multiple-java_passk', 'pass@1'],
['humaneval-multiple-rb_passk', 'pass@1'],
['humaneval-multiple-js_passk', 'pass@1'],
['humaneval-multiple-php_passk', 'pass@1'],
['humaneval-multiple-r_passk', 'pass@1'],
['humaneval-multiple-rs_passk', 'pass@1'],
['humaneval-multiple-sh_passk', 'pass@1'],
['mbpp-multiple-cpp_passk', 'pass@1'],
['mbpp-multiple-cs_passk', 'pass@1'],
['mbpp-multiple-go_passk', 'pass@1'],
['mbpp-multiple-java_passk', 'pass@1'],
['mbpp-multiple-rb_passk', 'pass@1'],
['mbpp-multiple-js_passk', 'pass@1'],
['mbpp-multiple-php_passk', 'pass@1'],
['mbpp-multiple-r_passk', 'pass@1'],
['mbpp-multiple-rs_passk', 'pass@1'],
['mbpp-multiple-sh_passk', 'pass@1'],
'',
'pass@3',
['bigcodebench_full_instruct_passk', 'pass@3'],
['bigcodebench_hard_instruct_passk', 'pass@3'],
['lcb_code_generation_passk', 'pass@3'],
['openai_humaneval_passk_passk', 'humaneval_pass@3'],
['humaneval_pro_passk', 'pass@3'],
['mbpp_passk_passk', 'pass@3'],
['mbpp_pro_passk', 'pass@3'],
['humaneval-multiple-cpp_passk', 'pass@3'],
['humaneval-multiple-cs_passk', 'pass@3'],
['humaneval-multiple-go_passk', 'pass@3'],
['humaneval-multiple-java_passk', 'pass@3'],
['humaneval-multiple-rb_passk', 'pass@3'],
['humaneval-multiple-js_passk', 'pass@3'],
['humaneval-multiple-php_passk', 'pass@3'],
['humaneval-multiple-r_passk', 'pass@3'],
['humaneval-multiple-rs_passk', 'pass@3'],
['humaneval-multiple-sh_passk', 'pass@3'],
['mbpp-multiple-cpp_passk', 'pass@3'],
['mbpp-multiple-cs_passk', 'pass@3'],
['mbpp-multiple-go_passk', 'pass@3'],
['mbpp-multiple-java_passk', 'pass@3'],
['mbpp-multiple-rb_passk', 'pass@3'],
['mbpp-multiple-js_passk', 'pass@3'],
['mbpp-multiple-php_passk', 'pass@3'],
['mbpp-multiple-r_passk', 'pass@3'],
['mbpp-multiple-rs_passk', 'pass@3'],
['mbpp-multiple-sh_passk', 'pass@3'],
'',
'pass@5',
['bigcodebench_full_instruct_passk', 'pass@5'],
['bigcodebench_hard_instruct_passk', 'pass@5'],
['lcb_code_generation_passk', 'pass@5'],
['openai_humaneval_passk_passk', 'humaneval_pass@5'],
['humaneval_pro_passk', 'pass@5'],
['mbpp_passk_passk', 'pass@5'],
['mbpp_pro_passk', 'pass@5'],
['humaneval-multiple-cpp_passk', 'pass@5'],
['humaneval-multiple-cs_passk', 'pass@5'],
['humaneval-multiple-go_passk', 'pass@5'],
['humaneval-multiple-java_passk', 'pass@5'],
['humaneval-multiple-rb_passk', 'pass@5'],
['humaneval-multiple-js_passk', 'pass@5'],
['humaneval-multiple-php_passk', 'pass@5'],
['humaneval-multiple-r_passk', 'pass@5'],
['humaneval-multiple-rs_passk', 'pass@5'],
['humaneval-multiple-sh_passk', 'pass@5'],
['mbpp-multiple-cpp_passk', 'pass@5'],
['mbpp-multiple-cs_passk', 'pass@5'],
['mbpp-multiple-go_passk', 'pass@5'],
['mbpp-multiple-java_passk', 'pass@5'],
['mbpp-multiple-rb_passk', 'pass@5'],
['mbpp-multiple-js_passk', 'pass@5'],
['mbpp-multiple-php_passk', 'pass@5'],
['mbpp-multiple-r_passk', 'pass@5'],
['mbpp-multiple-rs_passk', 'pass@5'],
['mbpp-multiple-sh_passk', 'pass@5'],
],
)
work_dir = 'outputs/code_passk'
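The repeated samples configured here are scored with the unbiased pass@k estimator (see estimate_pass_at_k added to CodeEvaluator further down in this PR). A small worked sketch with assumed counts:

```python
import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased estimator: 1 - C(n - c, k) / C(n, k)."""
    if n - c < k:
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


# num_repeats = 5 samples per problem, 2 of which pass:
for k in (1, 3, 5):
    print(f'pass@{k} = {pass_at_k(5, 2, k):.2f}')
# -> pass@1 = 0.40, pass@3 = 0.90, pass@5 = 1.00
```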

View File

@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_full_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api=
# 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='full',
num_repeats=1,
k=1,
),
pred_role='BOT',
)
bigcodebench_full_instruct_datasets = [
dict(abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2',
num_repeats=1,)
]

View File

@ -24,10 +24,12 @@ bigcodebench_hard_eval_cfg = dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api=
# 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
num_repeats=1,
k=1,
),
pred_role='BOT',
)
@ -42,5 +44,6 @@ bigcodebench_hard_instruct_datasets = [
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
num_repeats=1,
)
]

View File

@ -19,9 +19,9 @@ humaneval_infer_cfg = dict(
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvalEvaluator),
evaluator=dict(type=HumanEvalEvaluator,
k=1),
pred_role='BOT',
k=[1, 10, 100], # the parameter only for humaneval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
@ -32,5 +32,6 @@ humaneval_datasets = [
path='opencompass/humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
eval_cfg=humaneval_eval_cfg,
num_repeats=1)
]

View File

@ -0,0 +1,17 @@
# HumanEval pro
## OC results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 65 |
| qwen2.5-14b-instruct-hf | 67 |
| deepseek-v2-lite-chat-hf | 35 |
## CodeEval-pro results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 65 |
| qwen2.5-14b-instruct-hf | 65 |
| deepseek-v2-lite-chat-hf | 28 |
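## Minimal eval config (sketch)

A minimal sketch for evaluating this dataset alone (swap in mbpppro_datasets for MBPP pro); the model import is one of the models used in the example configs above and is only an assumption here:

```python
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )

datasets = humanevalpro_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'outputs/humaneval_pro'
```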

View File

@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
@@ Response
Please put the two solutions to the above problems in one Python code block.
"""
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
humanevalpro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
humanevalpro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalpro_eval_cfg = dict(
evaluator=dict(type=HumanevalProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space',
k=1)
)
humanevalpro_datasets = [
dict(
abbr='humaneval_pro',
type=HumanevalevalProDataset,
path='opencompass/humaneval_pro',
num_repeats=1,
reader_cfg=humanevalpro_reader_cfg,
infer_cfg=humanevalpro_infer_cfg,
eval_cfg=humanevalpro_eval_cfg,)
]

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
humanevalx_reader_cfg = dict(
input_columns=['prompt'], output_column='declaration', train_split='test')
humanevalx_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalx_eval_cfg_dict = {
lang : dict(
evaluator=dict(
type=HumanevalXEvaluator,
language=lang,
ip_address=
'localhost', # replace with your code_eval_server ip_address / port
port=5001), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
pred_role='BOT')
for lang in ['python', 'cpp', 'java', 'js'] # rust & go are not supported yet
}
# Please download the needed `xx.jsonl.gz` from
# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
# and move them into `data/humanevalx/` folder
humanevalx_datasets = [
dict(
type=HumanevalXDataset,
abbr=f'humanevalx-{lang}',
language=lang,
path='./data/humanevalx',
reader_cfg=humanevalx_reader_cfg,
infer_cfg=humanevalx_infer_cfg,
eval_cfg=humanevalx_eval_cfg_dict[lang])
for lang in ['python', 'cpp', 'java', 'js']
]
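As the comments note, humanevalx relies on a self-hosted code-eval service. A sketch of pointing the evaluators at such a deployment from a top-level config, mirroring the override loop in examples/eval_codebench.py (the host below is a placeholder):

```python
# Hypothetical local deployment of the code-eval server.
for item in humanevalx_datasets:
    item['eval_cfg']['evaluator']['ip_address'] = '127.0.0.1'
    item['eval_cfg']['evaluator']['port'] = 5001
```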

View File

@ -33,9 +33,11 @@ lcb_code_generation_eval_cfg = dict(
evaluator=dict(type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
release_version='release_v5',
release_version='v5',
start_date='2024-08-01',
end_date='2025-02-01'),
end_date='2025-02-01',
num_repeats=1,
k=1,),
pred_role='BOT',
)
@ -46,7 +48,8 @@ LCBCodeGeneration_dataset = dict(
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg,
release_version='release_v5',
release_version='v5',
num_repeats=1,
)
# Code Execution Dataset
@ -127,6 +130,6 @@ LCBTestOutput_dataset = dict(
LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
# LCBCodeExecution_dataset,
# LCBTestOutput_dataset,
]

View File

@ -28,7 +28,9 @@ mbpp_infer_cfg = dict(
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT')
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator,
k=1),
pred_role='BOT')
mbpp_datasets = [
dict(
@ -38,5 +40,6 @@ mbpp_datasets = [
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg,
num_repeats=1,
)
]

View File

@ -0,0 +1,17 @@
# MBPP pro
## OC results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 66 |
| qwen2.5-14b-instruct-hf | 64 |
| deepseek-v2-lite-chat-hf | 36 |
## CodeEval-pro results
| model | pass@1 |
|:--------------------------:|---------:|
|qwen2.5-coder-7b-instruct-hf| 65 |
| qwen2.5-14b-instruct-hf | 65 |
| deepseek-v2-lite-chat-hf | 39 |

View File

@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
@@ Response
Please put the two solutions to the above problems in one Python code block.
"""
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
mbpppro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
mbpppro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
mbpppro_eval_cfg = dict(
evaluator=dict(type=MBPPProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space',
k=1),
)
mbpppro_datasets = [
dict(
abbr='mbpp_pro',
type=MBPPProDataset,
path='opencompass/mbpp_pro',
num_repeats=1,
reader_cfg=mbpppro_reader_cfg,
infer_cfg=mbpppro_infer_cfg,
eval_cfg=mbpppro_eval_cfg)
]

View File

@ -21,6 +21,7 @@ multiple_eval_cfg = {
evaluator=dict(
type=MultiplEEvaluator,
language=lang,
k=1,
ip_address='https://opencompass-multiple-evaluator.hf.space',
),
pred_role='BOT',

View File

@ -0,0 +1,6 @@
multiple_summary_groups = []
humaneval_multiple = ['humaneval-multiple-cpp', 'humaneval-multiple-cs', 'humaneval-multiple-go', 'humaneval-multiple-java', 'humaneval-multiple-rb', 'humaneval-multiple-js', 'humaneval-multiple-php', 'humaneval-multiple-r', 'humaneval-multiple-rs', 'humaneval-multiple-sh']
mbpp_multiple = ['mbpp-multiple-cpp', 'mbpp-multiple-cs', 'mbpp-multiple-go', 'mbpp-multiple-java', 'mbpp-multiple-rb', 'mbpp-multiple-js', 'mbpp-multiple-php', 'mbpp-multiple-r', 'mbpp-multiple-rs', 'mbpp-multiple-sh']
multiple_summary_groups.append({'name': 'multiple', 'subsets': humaneval_multiple + mbpp_multiple})

View File

@ -62,6 +62,7 @@ from .hle import * # noqa: F401, F403
from .huggingface import * # noqa: F401, F403
from .humaneval import * # noqa: F401, F403
from .humaneval_multi import * # noqa: F401, F403
from .humaneval_pro import * # noqa: F401, F403
from .humanevalx import * # noqa: F401, F403
from .hungarian_math import * # noqa: F401, F403
from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403
@ -91,6 +92,7 @@ from .math401 import * # noqa: F401, F403
from .math_intern import * # noqa: F401, F403
from .mathbench import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .mbpp_pro import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403
from .MedXpertQA import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403

View File

@ -4,6 +4,7 @@
import os
import time
from concurrent.futures._base import CancelledError
from typing import List, Sequence, Tuple, Union
import httpx
from datasets import Dataset, DatasetDict
@ -24,7 +25,8 @@ class BigCodeBenchDataset(BaseDataset):
def load(path: str = 'opencompass/bigcodebench',
local_mode: bool = False,
release_version: str = 'v0.1.2',
dataset_version: str = 'full'):
dataset_version: str = 'full',
num_repeats: int = 1):
"""
Args:
path (str): The path to the dataset.
@ -33,6 +35,7 @@ class BigCodeBenchDataset(BaseDataset):
release_version (str): The release version of the dataset.
dataset_version (str): The data version of the dataset.
only support ['full', 'hard']
num_repeats (int): Number of times to repeat dataset for pass@k.
"""
assert dataset_version in ['full', 'hard'], \
'dataset_version should be one of ["full", "hard"], '
@ -45,11 +48,13 @@ class BigCodeBenchDataset(BaseDataset):
# 'entry_point', 'doc_struct', 'libs'
if dataset_version == 'full':
items = JSONToolkit.read_jsonl(
os.path.join(path, f'BigCodeBench-{release_version}.jsonl'))
os.path.join(path, f'BigCodeBench-{release_version}.jsonl'),
num_repeats)
else:
items = JSONToolkit.read_jsonl(
os.path.join(path,
f'BigCodeBench-Hard-{release_version}.jsonl'))
f'BigCodeBench-Hard-{release_version}.jsonl'),
num_repeats)
dataset['train'] = Dataset.from_list(items)
dataset['test'] = Dataset.from_list(items)
@ -61,10 +66,10 @@ class BigCodeBenchEvaluator(BaseEvaluator):
"""Evaluator for BigCodeBench.
Args:
num_process_evaluate (int): number of processes to evaluate
timeout (int): timeout for each evaluation
release_version (str): release version of BigCodeBench
eval_type (str): type of evaluation, either 'instruct' or 'completion'
k (Union[int, Tuple[int, ...], List[int]]): pass@k value(s) for evaluation
"""
def __init__(
@ -75,7 +80,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
dataset_version: str = 'full',
local_mode: bool = False,
path: str = 'opencompass/bigcodebench',
num_repeats=1,
pass_k: str = '1,5,10',
k: Union[int, Tuple[int, ...], List[int]] = 1,
parallel: int = -1,
min_time_limit: float = 1,
max_as_limit: int = 30 * 1024,
@ -88,12 +95,17 @@ class BigCodeBenchEvaluator(BaseEvaluator):
release_version=release_version,
dataset_version=dataset_version,
local_mode=local_mode,
path=path)['test']
path=path,
num_repeats=num_repeats)['test']
self.eval_type = eval_type
if not isinstance(k, Sequence):
k = (k, )
k = ', '.join(map(str, k))
self.k = k
self.remote_execute_api = remote_execute_api
self.eval_kwargs = dict(subset=dataset_version,
pass_k=pass_k,
pass_k=self.k,
parallel=parallel,
min_time_limit=min_time_limit,
max_as_limit=max_as_limit,
@ -141,7 +153,7 @@ class BigCodeBenchEvaluator(BaseEvaluator):
signal.alarm(0)
signal.signal(signal.SIGALRM, original_handler)
with timeout_handler(10):
with timeout_handler(300):
sanitized_prediction = extract_code_generation(
prediction, entrypoint=entrypoint)
@ -188,7 +200,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
while True:
try:
eval_client = Client(self.remote_execute_api,
httpx_kwargs=dict(proxies=proxies))
httpx_kwargs=dict(
proxies=proxies,
timeout=httpx.Timeout(100.0)))
results, pass_at_k = eval_client.predict(
split=self.eval_type,
samples=handle_file(submitted_contents_path),
@ -196,22 +210,25 @@ class BigCodeBenchEvaluator(BaseEvaluator):
**self.eval_kwargs)
break
except (httpx.ReadTimeout, CancelledError):
logger.info('Read timeout error. Retrying in 4s...')
logger.info('Read timeout error. Retrying in 10s...')
time.sleep(10)
if 'pass@1' in pass_at_k.keys():
pass_at_k['pass@1'] *= 100
dump_results = {'details': self._results_processor(results)}
dump_results.update(pass_at_k)
return dump_results
pass_at_k = {
k: v * 100 if isinstance(v, (int, float)) else v
for k, v in pass_at_k.items()
}
return {
**pass_at_k,
'details': self._results_processor(results),
}
def _results_processor(self, results):
details = []
for key, value in results['eval'].items():
if value[0]['status'] == 'pass':
value[0]['correct'] = True
else:
value[0]['correct'] = False
details.append(value[0])
detail = {'correct': False, 'results_details': value}
for v in value:
if v['status'] == 'pass':
detail['correct'] = True
break
details.append(detail)
return details
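The reworked _results_processor marks a problem correct if any of its repeated generations passes. A toy sketch of that reduction (the keys and result shape below are hypothetical, modeled on the loop above):

```python
results = {'eval': {
    'BigCodeBench/0': [{'status': 'fail'}, {'status': 'pass'}],
    'BigCodeBench/1': [{'status': 'fail'}, {'status': 'fail'}],
}}

details = []
for key, value in results['eval'].items():
    details.append({'correct': any(v['status'] == 'pass' for v in value),
                    'results_details': value})

print([d['correct'] for d in details])  # -> [True, False]
```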

View File

@ -191,14 +191,19 @@ class CodeCustomDataset(BaseDataset):
path = get_data_path(path, local_mode=local_mode)
if file_name is not None:
path = os.path.join(path, file_name)
files = os.listdir(path)
data = []
if path.endswith('.jsonl'):
with open(path, 'r', encoding='utf-8') as f:
if any(f.endswith('.jsonl') for f in files):
target_file = next(f for f in files if f.endswith('.jsonl'))
target_path = os.path.join(path, target_file)
with open(target_path, 'r', encoding='utf-8') as f:
for line in f:
data.extend(
[json.loads(line.strip()) for _ in range(num_repeats)])
elif path.endswith('.csv'):
with open(path, 'r', encoding='utf-8-sig') as f:
elif any(f.endswith('.csv') for f in files):
target_file = next(f for f in files if f.endswith('.csv'))
target_path = os.path.join(path, target_file)
with open(target_path, 'r', encoding='utf-8-sig') as f:
reader = csv.reader(f)
header = next(reader)
for row in reader:

View File

@ -6,7 +6,7 @@ import os.path as osp
import re
import tempfile
from os import environ
from typing import List
from typing import List, Sequence, Tuple, Union
from datasets import Dataset
@ -70,12 +70,16 @@ class HumanevalDataset(BaseDataset):
class HumanEvalEvaluator(BaseEvaluator):
"""Evaluator for HumanEval or EvalPlus."""
def __init__(self, k: List[int] = [1, 10, 100]) -> None:
def __init__(self, k: Union[int, Tuple[int, ...], List[int]] = 1,
num_repeats: int = 1) -> None:
try:
import human_eval
except ImportError:
raise ImportError(HUMANEVAL_IMPORT_ERROR)
self.n = num_repeats
if not isinstance(k, Sequence):
k = (k, )
self.k = k
super().__init__()
@ -87,16 +91,24 @@ class HumanEvalEvaluator(BaseEvaluator):
from human_eval.evaluation import evaluate_functional_correctness
prompts = [item['prompt'] for item in test_set]
humaneval_preds = []
predictions_processed, references_processed = [], []
for pred, refer in zip(predictions, references):
if references_processed and refer == references_processed[-1]:
predictions_processed[-1].extend([pred])
else:
references_processed.append(refer)
predictions_processed.append([pred])
# create json file in human_eval format
for preds, refer in zip(predictions, references):
humaneval_preds = []
for preds_p, refer_p in zip(predictions_processed, references_processed):
# suits for two cases
# 1. use repeated dataset
# 2. use `num_return_sequences` to generate multiple responses
if not isinstance(preds, list):
preds = [preds]
for pred in preds:
humaneval_preds.append({'task_id': refer, 'completion': pred})
if not isinstance(preds_p, list):
preds_p = [preds_p]
for pred_p in preds_p:
humaneval_preds.append({'task_id': refer_p, 'completion': pred_p})
with tempfile.TemporaryDirectory() as tmp_dir:
out_dir = osp.join(tmp_dir, 'human_eval.json')
write_jsonl(out_dir, humaneval_preds)
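The regrouping above assumes repeated copies of a problem are adjacent in the inference output. A toy illustration of that bucketing (made-up predictions and task ids):

```python
predictions = ['cand_a1', 'cand_a2', 'cand_b1', 'cand_b2']
references = ['HumanEval/0', 'HumanEval/0', 'HumanEval/1', 'HumanEval/1']

predictions_processed, references_processed = [], []
for pred, refer in zip(predictions, references):
    if references_processed and refer == references_processed[-1]:
        predictions_processed[-1].append(pred)  # same task -> same bucket
    else:
        references_processed.append(refer)
        predictions_processed.append([pred])

print(references_processed)   # ['HumanEval/0', 'HumanEval/1']
print(predictions_processed)  # [['cand_a1', 'cand_a2'], ['cand_b1', 'cand_b2']]
```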
@ -183,13 +195,13 @@ def humaneval_postprocess_v2(text: str) -> str:
blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
if len(blocks) >= 1:
text = blocks[0]
return text
return text.lstrip()
def humaneval_postprocess_v3(text: str) -> str:
blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
if len(blocks) >= 1:
text = blocks[-1]
return text
return text.lstrip()
def humaneval_internal_v2_postprocess(text: str):
if text.startswith(' ') and not text.startswith(' '):

View File

@ -0,0 +1,96 @@
import json
from typing import Dict, List
import numpy as np
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
class HumanevalevalProDataset(BaseDataset):
@staticmethod
def load(path, num_repeats=1, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
raw_data = json.load(f)
for data in raw_data:
dataset.extend([data for _ in range(num_repeats)])
return Dataset.from_list(dataset)
class HumanevalProEvaluator(CodeEvaluator):
def _process_completions(self, test_case: dict, completions: list) -> list:
processed_completions = []
for comp in completions:
post_comp = self._extract_code(comp)
processed_completions.append(post_comp)
return processed_completions
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
num_repeats = int(len(test_set) / len(test_set_origin))
# 1. Prepare data for all test cases
all_test_cases = []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completions = predictions[i * num_repeats:(i + 1) * num_repeats]
# Process code completions
processed_completions = self._process_completions(
test_case, completions)
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'prompt': '',
'tests': test_case['test_code'],
'processed_completions': processed_completions,
'completions': completions
}
all_test_cases.append(sub_data_dict)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
details = []
total, correct = [], []
for output in outputs:
passed = [m['status'] == 'OK' for m in output['meta_data']]
total.append(len(passed))
correct.append(sum(passed))
details.append(output)
total = np.array(total)
correct = np.array(correct)
pass_at_k = {
f'pass@{k}':
self.estimate_pass_at_k(total, correct, k).mean() * 100
for k in self.k if (total >= k).all()
}
return {
**pass_at_k,
'details': details,
}

View File

@ -0,0 +1,89 @@
import json
import os
import evaluate
from datasets import Dataset
from opencompass.openicl.icl_evaluator import HuggingfaceEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
os.environ['HF_ALLOW_CODE_EVAL'] = '1'
class HumanevalevalProDataset(BaseDataset):
@staticmethod
def load(path, num_repeats=1, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
raw_data = json.load(f)
for data in raw_data:
dataset.extend([data for _ in range(num_repeats)])
return Dataset.from_list(dataset)
class HumanevalProEvaluator(HuggingfaceEvaluator):
def _preprocess(self, predictions, references):
predictions = [[_] for _ in predictions]
return {
'predictions': predictions,
'references': references,
}
def _postprocess(self, scores):
scores = {f'humaneval_{k}': scores[k] * 100 for k in scores}
return scores
def score(self, predictions, references, test_set):
# predictions are LLM's output; references are the 'output_column' of 'humanevalpro_reader_cfg' # noqa: E501
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
# use codes pre-downloaded to opencompass repo, avoid downloading
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
local_path = os.path.join(parent_dir, 'openicl', 'icl_evaluator',
'hf_metrics', self.metric)
if os.path.exists(local_path):
metric = evaluate.load(local_path)
else:
metric = evaluate.load(self.metric)
scores, _ = metric.compute(**self._preprocess(predictions, references),
k=[1, 3, 5],
num_workers=4)
result = self._postprocess(scores)
return result
def humanevalpro_postprocess_official(text):
"""The official post-processing method for humaneval_pro, which is solely
applicable to the complete generation paradigm.
# noqa: E501 The chat template paradigm requires a different post-
processing method.
"""
text = text[:index if (index := text.find('```')) != -1 else len(text)]
return text
def humanevalpro_postprocess_oc(text):
"""For those generated based on the chat template paradigm, this method is
recommended.
# noqa: E501
"""
start = text.rfind('```python') + len('```python')
end = text.find('```', start)
code = text[start:end].strip()
return code
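A quick illustration of the two post-processors on made-up model outputs; the import path is an assumption based on where this module appears to live:

```python
from opencompass.datasets.humaneval_pro import (
    humanevalpro_postprocess_oc, humanevalpro_postprocess_official)

FENCE = '`' * 3  # avoid nesting literal code fences inside this snippet

# Chat-template style output: the last fenced python block is kept.
chat_output = f'Here is my solution:\n{FENCE}python\ndef add(a, b):\n    return a + b\n{FENCE}\n'
print(humanevalpro_postprocess_oc(chat_output))
# def add(a, b):
#     return a + b

# Completion style output: everything before the first fence is kept.
completion_output = f'    return a + b\n{FENCE}\nExplanation: ...'
print(repr(humanevalpro_postprocess_official(completion_output)))
# '    return a + b\n'
```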

View File

@ -90,7 +90,7 @@ class HumanevalXEvaluator(BaseEvaluator):
self.timeout = timeout
super().__init__()
def score(self, predictions, references):
def score(self, predictions, references, test_set):
predictions = [{
'task_id':
f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',

View File

@ -3,6 +3,7 @@ import json
import multiprocessing
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Sequence
import numpy as np
from tqdm import tqdm
@ -174,7 +175,7 @@ def codegen_metrics(
samples_list,
generations_list,
k_list=[1, 5, 10, 20, 40, 50, 75, 100, 125, 150, 200, 500, 1000],
num_process_evaluate=16,
num_process_evaluate=8,
timeout=6,
debug=False,
):
@ -238,14 +239,20 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
release_version='release_v1',
extractor_version='v1',
start_date=None,
end_date=None):
end_date=None,
num_repeats=1,
k=1):
super().__init__()
self.num_process_evaluate = num_process_evaluate
self.timeout = timeout
if not isinstance(k, Sequence):
k = (k, )
self.k = k
self.dataset = LCBCodeGenerationDataset.load(
release_version=release_version,
start_date=start_date,
end_date=end_date)['test']
end_date=end_date,
num_repeats=num_repeats)['test']
self.extractor_version = extractor_version
def score(self, predictions, references):
@ -273,8 +280,11 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
filtered_references = []
for idx, item in enumerate(references):
if item in self.dataset['question_id']:
filtered_predictions.append(predictions[idx])
filtered_references.append(item)
if filtered_references and item == filtered_references[-1]:
filtered_predictions[-1].extend(predictions[idx])
else:
filtered_predictions.append(predictions[idx])
filtered_references.append(item)
filtered_references = [
evaluation_samples[item] for item in filtered_references
@ -291,7 +301,7 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
metrics, eval_results, final_metadata = codegen_metrics(
filtered_references,
filtered_predictions,
k_list=[1],
k_list=self.k,
num_process_evaluate=self.num_process_evaluate,
timeout=self.timeout,
)

View File

@ -56,7 +56,8 @@ class LCBCodeGenerationDataset(BaseDataset):
local_mode: bool = False,
release_version: str = 'release_v1',
start_date: str = None,
end_date: str = None):
end_date: str = None,
num_repeats: int = None):
def transform(item):
# Define the dataitem mapping logic
@ -118,7 +119,13 @@ class LCBCodeGenerationDataset(BaseDataset):
if end_date is not None:
p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
dataset = dataset.filter(lambda e: datetime.fromisoformat(e[
'contest_date']) <= p_end_date) # noqa: E501
'contest_date']) <= p_end_date)
if num_repeats and num_repeats > 1:
indices = []
for idx in range(len(dataset)):
indices.extend([idx] * num_repeats)
dataset = dataset.select(indices)
return DatasetDict({'test': dataset, 'train': dataset})
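The select-based repetition added above can be tried in isolation with a toy Hugging Face Dataset (the datasets library is already a dependency of this module):

```python
from datasets import Dataset

ds = Dataset.from_list([{'question_id': 'q0'}, {'question_id': 'q1'}])

num_repeats = 3
indices = []
for idx in range(len(ds)):
    indices.extend([idx] * num_repeats)

print(ds.select(indices)['question_id'])
# -> ['q0', 'q0', 'q0', 'q1', 'q1', 'q1']
```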

View File

@ -436,7 +436,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
"""Better use for pass k evaluation.
Args:
k(Tuple[int]): Choices of Pass@k. Defaults to (1, 10, 100)
k(Union[int, Tuple[int, ...], List[int]]): Choices of Pass@k.
"""
def __init__(self, k=(1, 10, 100)) -> None:
@ -478,7 +478,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
task_total = defaultdict(int)
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
with ProcessPoolExecutor() as executor:
with ProcessPoolExecutor(max_workers=8) as executor:
futures = []
for refer, preds in zip(references, predictions):
# suits for two cases
@ -494,7 +494,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
for pred in preds:
pred = self._process_answer(pred)
programs = self._process_test(test_case, pred)
future = executor.submit(execution, programs, task_id, 10)
future = executor.submit(execution, programs, task_id, 8)
futures.append(future)
from tqdm import tqdm

View File

@ -0,0 +1,97 @@
import json
from typing import Dict, List
import numpy as np
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
class MBPPProDataset(BaseDataset):
@staticmethod
def load(path, num_repeats=1, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
for line in f:
dataset.extend(
[json.loads(line.strip()) for _ in range(num_repeats)])
return Dataset.from_list(dataset)
class MBPPProEvaluator(CodeEvaluator):
def _process_completions(self, test_case: dict, completions: list) -> list:
processed_completions = []
for comp in completions:
post_comp = self._extract_code(comp)
processed_completions.append(post_comp)
return processed_completions
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
num_repeats = int(len(test_set) / len(test_set_origin))
# 1. Prepare data for all test cases
all_test_cases = []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completions = predictions[i * num_repeats:(i + 1) * num_repeats]
# Process code completions
processed_completions = self._process_completions(
test_case, completions)
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'prompt': '',
'tests': test_case['test_code'],
'processed_completions': processed_completions,
'completions': completions
}
all_test_cases.append(sub_data_dict)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
details = []
total, correct = [], []
for output in outputs:
passed = [m['status'] == 'OK' for m in output['meta_data']]
total.append(len(passed))
correct.append(sum(passed))
details.append(output)
total = np.array(total)
correct = np.array(correct)
pass_at_k = {
f'pass@{k}':
self.estimate_pass_at_k(total, correct, k).mean() * 100
for k in self.k if (total >= k).all()
}
return {
**pass_at_k,
'details': details,
}
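The scorer above recovers the repeat count by de-duplicating on the first column and then consumes predictions in fixed-size slices. A pandas toy example of that bookkeeping (made-up ids and predictions):

```python
import pandas as pd

test_set = pd.DataFrame({
    'id': [0, 0, 1, 1],  # each problem appears num_repeats times
    'test_code': ['t0', 't0', 't1', 't1'],
})
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
num_repeats = int(len(test_set) / len(test_set_origin))  # -> 2

predictions = ['p0_a', 'p0_b', 'p1_a', 'p1_b']
for i in range(len(test_set_origin)):
    completions = predictions[i * num_repeats:(i + 1) * num_repeats]
    print(int(test_set_origin.iloc[i]['id']), completions)
# -> 0 ['p0_a', 'p0_b']
# -> 1 ['p1_a', 'p1_b']
```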

View File

@ -1,12 +1,14 @@
# flake8: noqa: E501
import difflib
import itertools
import os
import re
import tempfile
import time
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
from datasets import Dataset
from gradio_client import Client
@ -24,19 +26,24 @@ class CodeEvaluator(BaseEvaluator):
"""
def __init__(self,
language: str,
language: str = 'py',
ip_address: str = 'localhost',
k: Union[int, Tuple[int, ...], List[int]] = 1,
retry: int = 3) -> None:
"""Initialize the CodeEvaluator.
Args:
language (str): Programming language of the code to evaluate.
ip_address (str, optional): IP address of the evaluation service. Defaults to 'localhost'.
k (Union[int, Tuple[int, ...], List[int]], optional): The k value(s) used for pass@k evaluation. Defaults to 1.
retry (int, optional): Number of retry attempts for failed connections. Defaults to 3.
"""
self.language = language
self.retry = retry
self.client = Client(ip_address)
if not isinstance(k, Sequence):
k = (k, )
self.k = k
super().__init__()
def _extract_code(self, text: str) -> str:
@ -195,6 +202,31 @@ class CodeEvaluator(BaseEvaluator):
return True, output, None
def estimate_pass_at_k(self, num_samples: Union[int, List[int],
np.ndarray],
num_correct: Union[List[int], np.ndarray],
k: int) -> np.ndarray:
"""Estimates pass@k of each problem and returns them in an array."""
def estimator(n: int, c: int, k: int) -> float:
"""
Calculates 1 - comb(n - c, k) / comb(n, k).
"""
if n - c < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
if isinstance(num_samples, int):
num_samples_it = itertools.repeat(num_samples, len(num_correct))
else:
assert len(num_samples) == len(num_correct)
num_samples_it = iter(num_samples)
return np.array([
estimator(int(n), int(c), k)
for n, c in zip(num_samples_it, num_correct)
])
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
"""Score code generation predictions against references.
@ -233,7 +265,7 @@ class CodeEvaluator(BaseEvaluator):
processed_completions = self._process_completions(
test_case, completions)
result_dict = {
sub_data_dict = {
'name': test_case['name'],
'language': test_case['language'],
'prompt': test_case['prompt'],
@ -242,7 +274,7 @@ class CodeEvaluator(BaseEvaluator):
'completions': completions
}
all_test_cases.append(result_dict)
all_test_cases.append(sub_data_dict)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
@ -251,17 +283,22 @@ class CodeEvaluator(BaseEvaluator):
# 3. Process the returned results
details = []
correct = 0
total, correct = [], []
for output in outputs:
if output.get('status') == 'OK':
output['correct'] = True
correct += 1
else:
output['correct'] = False
passed = [m['status'] == 'OK' for m in output['meta_data']]
total.append(len(passed))
correct.append(sum(passed))
details.append(output)
total = np.array(total)
correct = np.array(correct)
pass_at_k = {
f'pass@{k}':
self.estimate_pass_at_k(total, correct, k).mean() * 100
for k in self.k if (total >= k).all()
}
return {
f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
'details': details
**pass_at_k,
'details': details,
}

View File

@ -420,6 +420,16 @@ DATASETS_MAPPING = {
"hf_id": "",
"local": "./data/OlympiadBench",
},
"opencompass/humaneval_pro": {
"ms_id": "",
"hf_id": "",
"local": "./data/humaneval_pro/humaneval_pro.json",
},
"opencompass/mbpp_pro": {
"ms_id": "",
"hf_id": "",
"local": "./data/mbpp_pro/mbpp_pro.json",
},
}
DATASETS_URL = {
@ -746,5 +756,13 @@ DATASETS_URL = {
"url":
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
"md5": "270f399f4142b74f47ecff116cc3b21d"
}
},
"humaneval_pro": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
"md5": "4c6fe556e84e905e4f0902d699e46de5",
},
"mbpp_pro": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
"md5": "eac330b8a0a8687f006265c9383503ce",
},
}

View File

@ -51,7 +51,8 @@ class JSONToolkit:
raise
@staticmethod
def read_jsonl(file_path: Union[str, Path]) -> List[Dict[str, Any]]:
def read_jsonl(file_path: Union[str, Path],
num_repeats: int = 1) -> List[Dict[str, Any]]:
"""Read a JSONL file and return its contents as a list of dictionaries.
Args:
@ -73,7 +74,9 @@ class JSONToolkit:
if not line: # Skip empty lines
continue
try:
results.append(json.loads(line))
# results.append(json.loads(line))
results.extend(
[json.loads(line) for _ in range(num_repeats)])
except json.JSONDecodeError as e:
logger.error(
f'Invalid JSON on line {line_num}: {str(e)}')