mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

[Dataset] Add human_eval/mbpp pro (#2092)

* add bench
* update
* bug fix
* time update
* add index
* fix repeat bug

This commit is contained in:
parent 345674f700
commit 2c79dc5227
dataset-index.yml
@@ -611,6 +611,12 @@
     paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
     configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
     configpath_llmjudge: ''
+- humaneval_pro:
+    name: HumanEval Pro
+    category: Code
+    paper: https://arxiv.org/abs/2412.21199
+    configpath: opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
+    configpath_llmjudge: ''
 - hungarian_math:
     name: Hungarian_Math
     category: Math
@@ -695,6 +701,12 @@
     paper: ''
     configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
     configpath_llmjudge: ''
+- mbpp_pro:
+    name: MBPP Pro
+    category: Code
+    paper: https://arxiv.org/abs/2412.21199
+    configpath: opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
+    configpath_llmjudge: ''
 - mgsm:
     name: MGSM
     category: Language / Math
17  opencompass/configs/datasets/humaneval_pro/README.md  Normal file
@@ -0,0 +1,17 @@
+# HumanEval pro
+
+## OC results
+
+| model | pass@1 |
+|:----------------------------:|-------:|
+| qwen2.5-coder-7b-instruct-hf | 65 |
+| qwen2.5-14b-instruct-hf | 67 |
+| deepseek-v2-lite-chat-hf | 35 |
+
+## CodeEval-pro results
+
+| model | pass@1 |
+|:----------------------------:|-------:|
+| qwen2.5-coder-7b-instruct-hf | 65 |
+| qwen2.5-14b-instruct-hf | 65 |
+| deepseek-v2-lite-chat-hf | 28 |
4  opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py  Normal file
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .humaneval_pro_gen_3dc067 import humanevalpro_datasets  # noqa: F401, F403
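
For orientation: the four-line file above only re-exports the concrete config, so an end-to-end run needs a top-level config pairing it with a model. A minimal sketch follows; the model import path is an assumption for illustration (substitute any model config shipped with your OpenCompass install), and older checkouts launch via `python run.py` rather than the `opencompass` entry point.

```python
# eval_humaneval_pro.py -- minimal run config (sketch; model path assumed)
from mmengine.config import read_base

with read_base():
    # the dataset list added by this commit
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import \
        humanevalpro_datasets
    # hypothetical model config -- substitute one from your install
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
        models

datasets = humanevalpro_datasets
```

Launched with `opencompass eval_humaneval_pro.py`, the run resolves the dataset through the DATASETS_MAPPING/DATASETS_URL entries added at the bottom of this commit.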
46  opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py  Normal file
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+humanevalpro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+humanevalpro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalpro_eval_cfg = dict(
+    evaluator=dict(type=HumanevalProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space')
+)
+
+humanevalpro_datasets = [
+    dict(
+        abbr='humaneval_pro',
+        type=HumanevalevalProDataset,
+        path='opencompass/humaneval_pro',
+        reader_cfg=humanevalpro_reader_cfg,
+        infer_cfg=humanevalpro_infer_cfg,
+        eval_cfg=humanevalpro_eval_cfg,)
+]
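
To make the template concrete: each dataset row supplies `raw_problem` and `new_problem` (per the reader config above), and the evaluator later re-renders the same wrapper when attaching prompts to results. A toy rendering, with invented problem text:

```python
# toy illustration: how the reader columns fill PROMPT_WRAPPER
raw_problem = 'def add(a, b):\n    """Return a + b."""\n'
new_problem = ('def add_three(a, b, c):\n'
               '    """Return a + b + c using add()."""\n')

prompt = PROMPT_WRAPPER.format(raw_problem=raw_problem,
                               new_problem=new_problem)
print(prompt)  # the model must reply with a single Python code block
```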
@@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+humanevalpro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+humanevalpro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalpro_eval_cfg = dict(
+    evaluator=dict(type=HumanevalProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space')
+)
+
+humanevalpro_datasets = [
+    dict(
+        abbr='humaneval_pro',
+        type=HumanevalevalProDataset,
+        path='opencompass/humaneval_pro',
+        reader_cfg=humanevalpro_reader_cfg,
+        infer_cfg=humanevalpro_infer_cfg,
+        eval_cfg=humanevalpro_eval_cfg,
+        n=5,
+        k=3)
+]
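
This second HumanEval pro config is identical apart from its tail, which adds `n=5, k=3`: five samples per problem, scored as pass@3. For reference, the standard unbiased pass@k estimator from the HumanEval paper is sketched below; OpenCompass's internal aggregation may differ in detail.

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate: n samples drawn, c of them correct."""
    if n - c < k:
        return 1.0  # every size-k subset contains a correct sample
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(n=5, c=2, k=3))  # 0.9
```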
17  opencompass/configs/datasets/mbpp_pro/README.md  Normal file
@@ -0,0 +1,17 @@
+# MBPP pro
+
+## OC results
+
+| model | pass@1 |
+|:----------------------------:|-------:|
+| qwen2.5-coder-7b-instruct-hf | 66 |
+| qwen2.5-14b-instruct-hf | 64 |
+| deepseek-v2-lite-chat-hf | 36 |
+
+## CodeEval-pro results
+
+| model | pass@1 |
+|:----------------------------:|-------:|
+| qwen2.5-coder-7b-instruct-hf | 65 |
+| qwen2.5-14b-instruct-hf | 65 |
+| deepseek-v2-lite-chat-hf | 39 |
4  opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py  Normal file
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .mbpp_pro_gen_3dc067 import mbpppro_datasets  # noqa: F401, F403
46  opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py  Normal file
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+mbpppro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+mbpppro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+mbpppro_eval_cfg = dict(
+    evaluator=dict(type=MBPPProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space'),
+)
+
+mbpppro_datasets = [
+    dict(
+        abbr='mbpp_pro',
+        type=MBPPProDataset,
+        path='opencompass/mbpp_pro',
+        reader_cfg=mbpppro_reader_cfg,
+        infer_cfg=mbpppro_infer_cfg,
+        eval_cfg=mbpppro_eval_cfg)
+]
@@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+mbpppro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+mbpppro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+mbpppro_eval_cfg = dict(
+    evaluator=dict(type=MBPPProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space'),
+)
+
+mbpppro_datasets = [
+    dict(
+        abbr='mbpp_pro',
+        type=MBPPProDataset,
+        path='opencompass/mbpp_pro',
+        reader_cfg=mbpppro_reader_cfg,
+        infer_cfg=mbpppro_infer_cfg,
+        eval_cfg=mbpppro_eval_cfg,
+        n=5,
+        k=3)
+]
4  opencompass/configs/datasets/multipl_e/multiple_gen.py  Normal file
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .multiple_top_ten_gen_f44aaf import multiple_datasets  # noqa: F401, F403
@@ -32,7 +32,6 @@ multiple_datasets = [
         type=MultiplEDataset,
         abbr=f'humaneval-multiple-{lang}',
         language=lang,
-        num_repeats=1,
         path='opencompass/multipl_e',
         tag='humaneval',
         reader_cfg=multiple_reader_cfg,
@@ -46,7 +45,6 @@ multiple_datasets += [
         type=MultiplEDataset,
         abbr=f'mbpp-multiple-{lang}',
         language=lang,
-        num_repeats=1,
         path='opencompass/multipl_e',
         tag='mbpp',
         reader_cfg=multiple_reader_cfg,
58  opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py  Normal file
@@ -0,0 +1,58 @@
+# Select the 10 most popular programming languages from MultiPL-E to compose the test set.
+
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MultiplEDataset, MultiplEEvaluator
+
+
+_TOP_TEN_LANGUAGE_ = ['cpp']
+
+multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests')
+
+multiple_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+multiple_eval_cfg = {
+    lang: dict(
+        evaluator=dict(
+            type=MultiplEEvaluator,
+            language=lang,
+            ip_address='https://opencompass-multiple-evaluator.hf.space',
+        ),
+        pred_role='BOT',
+    ) for lang in _TOP_TEN_LANGUAGE_
+}
+
+multiple_datasets = [
+    dict(
+        type=MultiplEDataset,
+        abbr=f'humaneval-multiple-{lang}',
+        language=lang,
+        path='opencompass/multipl_e',
+        tag='humaneval',
+        reader_cfg=multiple_reader_cfg,
+        infer_cfg=multiple_infer_cfg,
+        eval_cfg=multiple_eval_cfg[lang],
+        n=5,
+        k=3
+    ) for lang in _TOP_TEN_LANGUAGE_
+]
+
+multiple_datasets += [
+    dict(
+        type=MultiplEDataset,
+        abbr=f'mbpp-multiple-{lang}',
+        language=lang,
+        path='opencompass/multipl_e',
+        tag='mbpp',
+        reader_cfg=multiple_reader_cfg,
+        infer_cfg=multiple_infer_cfg,
+        eval_cfg=multiple_eval_cfg[lang],
+        n=5,
+        k=3
+    ) for lang in _TOP_TEN_LANGUAGE_
+]
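
Note that despite the header comment, `_TOP_TEN_LANGUAGE_` currently contains only `'cpp'`. Every dataset and eval entry is built by comprehension over that list, so enabling more languages is a one-line change. A quick demonstration of how the list fans out into dataset abbreviations (the extra tags are assumed to follow MultiPL-E naming):

```python
# hypothetical: widen the language list and observe the fan-out
_TOP_TEN_LANGUAGE_ = ['cpp', 'java', 'js']

abbrs = ([f'humaneval-multiple-{lang}' for lang in _TOP_TEN_LANGUAGE_] +
         [f'mbpp-multiple-{lang}' for lang in _TOP_TEN_LANGUAGE_])
print(abbrs)  # two dataset entries per language, six in total
```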
opencompass/datasets/__init__.py
@@ -64,6 +64,7 @@ from .hle import *  # noqa: F401, F403
 from .huggingface import *  # noqa: F401, F403
 from .humaneval import *  # noqa: F401, F403
 from .humaneval_multi import *  # noqa: F401, F403
+from .humaneval_pro import *  # noqa: F401, F403
 from .humanevalx import *  # noqa: F401, F403
 from .hungarian_math import *  # noqa: F401, F403
 from .IFEval.ifeval import IFEvalDataset, IFEvaluator  # noqa: F401, F403
@@ -96,6 +97,7 @@ from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
 from .mathbench import *  # noqa: F401, F403
 from .mbpp import *  # noqa: F401, F403
+from .mbpp_pro import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .MedCalc_Bench import MedCalc_BenchDataset  # noqa: F401
 from .MedCalc_Bench import MedCalcOfficial_Evaluator  # noqa: F401
81  opencompass/datasets/humaneval_pro.py  Normal file
@@ -0,0 +1,81 @@
+# flake8: noqa: E501
+
+import json
+from typing import Dict, List
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+class HumanevalevalProDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, local_mode=False):
+        path = get_data_path(path, local_mode=local_mode)
+        dataset = []
+        with open(path, encoding='utf-8') as f:
+            raw_data = json.load(f)
+            for data in raw_data:
+                dataset.append(data)
+        return Dataset.from_list(dataset)
+
+
+class HumanevalProEvaluator(CodeEvaluator):
+
+    def score(self, predictions: List, references: List,
+              test_set: Dataset) -> Dict:
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        test_set = test_set.to_pandas()
+        # Use the first column as the unique identifier
+        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
+
+        # 1. Prepare data for all test cases
+        all_test_cases, prompts = [], []
+        for i in range(len(test_set_origin)):
+            test_case = test_set_origin.iloc[i]
+            completion = predictions[i]
+
+            # Process code completions
+            processed_completion = self._process_completions(completion)
+            code = processed_completion + '\n' + test_case['test_code']
+            sub_data_dict = {
+                'name': int(test_case['id']),
+                'language': self.language,
+                'code': code,
+            }
+            all_test_cases.append(sub_data_dict)
+
+            prompt = PROMPT_WRAPPER.format(
+                raw_problem=test_case['raw_problem'],
+                new_problem=test_case['new_problem'])
+            prompts.append(prompt)
+
+        # 2. Send all test cases to the evaluation service
+        success, outputs, error_message = self._evaluate(all_test_cases)
+        if not success:
+            return {'error': error_message}
+
+        # 3. Process the returned results
+        return self._process_results(outputs, prompts, len(test_set_origin))
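
The loader expects a single JSON array on disk. Assuming the local layout registered in DATASETS_MAPPING at the end of this commit, a direct inspection might look like this (a sketch, not part of the commit):

```python
# sketch: load the local copy for inspection; the path matches the
# DATASETS_MAPPING entry added later in this commit
from opencompass.datasets import HumanevalevalProDataset

ds = HumanevalevalProDataset.load('./data/humaneval_pro/humaneval_pro.json',
                                  local_mode=True)
print(len(ds), ds.column_names)  # expect raw_problem, new_problem, test_code, id
```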
81  opencompass/datasets/mbpp_pro.py  Normal file
@@ -0,0 +1,81 @@
+# flake8: noqa: E501
+
+import json
+from typing import Dict, List
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+class MBPPProDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, local_mode=False):
+        path = get_data_path(path, local_mode=local_mode)
+        print(path)
+        dataset = []
+        with open(path, encoding='utf-8') as f:
+            for line in f:
+                dataset.append(json.loads(line.strip()))
+        return Dataset.from_list(dataset)
+
+
+class MBPPProEvaluator(CodeEvaluator):
+
+    def score(self, predictions: List, references: List,
+              test_set: Dataset) -> Dict:
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        test_set = test_set.to_pandas()
+        # Use the first column as the unique identifier
+        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
+
+        # 1. Prepare data for all test cases
+        all_test_cases, prompts = [], []
+        for i in range(len(test_set_origin)):
+            test_case = test_set_origin.iloc[i]
+            completion = predictions[i]
+
+            # Process code completions
+            processed_completion = self._process_completions(completion)
+            code = processed_completion + '\n' + test_case['test_code']
+            sub_data_dict = {
+                'name': int(test_case['id']),
+                'language': self.language,
+                'code': code,
+            }
+            all_test_cases.append(sub_data_dict)
+
+            prompt = PROMPT_WRAPPER.format(
+                raw_problem=test_case['raw_problem'],
+                new_problem=test_case['new_problem'])
+            prompts.append(prompt)
+
+        # 2. Send all test cases to the evaluation service
+        success, outputs, error_message = self._evaluate(all_test_cases)
+        if not success:
+            return {'error': error_message}
+
+        # 3. Process the returned results
+        return self._process_results(outputs, prompts, len(test_set_origin))
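
Unlike the HumanEval pro loader (one JSON array), `MBPPProDataset.load` reads JSON Lines: one object per line. A hypothetical record, with field names taken from the reader config and the `score` method above:

```python
import json

# hypothetical JSONL record; the field names (id, raw_problem,
# new_problem, test_code) are the ones the loader and evaluator read
record = {
    'id': 0,
    'raw_problem': 'def square(x):\n    """Return x * x."""\n',
    'new_problem': 'def sum_of_squares(xs):\n    """Sum square(x) over xs."""\n',
    'test_code': 'assert sum_of_squares([1, 2]) == 5',
}
print(json.dumps(record))  # one such line per problem in mbpp_pro.json
```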
opencompass/datasets/multipl_e.py
@@ -1,3 +1,4 @@
+import difflib
 import json
 import os.path as osp
 
@@ -28,7 +29,6 @@ class MultiplEDataset(BaseDataset):
     @staticmethod
     def load(path: str,
              language: str,
-             num_repeats: int = 1,
              tag: str = 'humaneval',
              local_mode: bool = False):
         """Load dataset for pass k mode.
@@ -56,8 +56,7 @@ class MultiplEDataset(BaseDataset):
         dataset = []
         with open(file_path, 'r', encoding='utf-8') as f:
             for line in f:
-                dataset.extend(
-                    [json.loads(line.strip()) for _ in range(num_repeats)])
+                dataset.append(json.loads(line.strip()))
         return Dataset.from_list(dataset)
 
 
@@ -84,20 +83,56 @@ class MultiplEEvaluator(CodeEvaluator):
             min_stop_index = stop_index
         return decoded_string[:min_stop_index]
 
+    def _remove_prefix(self,
+                       prompt: str,
+                       completion: str,
+                       threshold: float = 0.95) -> str:
+        """Determine the truncation point in the completion based on the last
+        line of the prompt, remove all content before that line in the
+        completion, and return the completion string after removing the prefix.
+        This is done to convert chatbot-style inference mode to completion
+        mode.
+
+        Args:
+            prompt (str): The prompt text.
+            completion (str): The completion text.
+            threshold (float): Line similarity threshold.
+
+        Returns:
+            str: The completion string after removing the prefix.
+        """
+        prompt_lines = prompt.splitlines()
+        completion_lines = completion.splitlines()
+
+        if not prompt_lines:
+            return completion
+
+        last_prompt_line = prompt_lines[-1]
+        cut_index = -1
+
+        for i, completion_line in enumerate(completion_lines):
+            similarity = difflib.SequenceMatcher(None, last_prompt_line,
+                                                 completion_line).ratio()
+            if similarity >= threshold:
+                cut_index = i
+                break
+
+        if cut_index != -1:
+            return '\n'.join(completion_lines[cut_index + 1:])
+        else:
+            return completion
+
-    def _process_completions(self, test_case, completions):
+    def _process_completions(self, test_case, completion):
         """Process completions with a test case.
 
         Args:
-            test_case: A test case.
-            completions: A list of completions.
+            test_case (dict): A test case containing prompt and stop tokens.
+            completion (str): The generated code completion.
         Returns:
-            A list of processed completions.
+            str: Processed code completion.
         """
-        processed_completions = []
-        for comp in completions:
-            comp = self._extract_code(comp)
-            post_comp = self._remove_prefix(test_case['prompt'], comp)
-            post_comp = self._stop_at_stop_token(post_comp,
-                                                 test_case['stop_tokens'])
-            processed_completions.append(post_comp)
-        return processed_completions
+        post_comp = self._extract_code(completion)
+        post_comp = self._remove_prefix(test_case['prompt'], post_comp)
+        post_comp = self._stop_at_stop_token(post_comp,
+                                             test_case['stop_tokens'])
+        return post_comp
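
The cut point in `_remove_prefix` is driven by `difflib.SequenceMatcher.ratio()`, which scores two strings' similarity in [0, 1]; any completion line scoring at least 0.95 against the last prompt line is treated as an echo of the prompt. A standalone demo of the mechanism:

```python
import difflib

prompt_tail = 'def fib(n):'  # last line of the original prompt
completion = ('Sure, here is the code:\n'
              'def fib(n):\n'
              '    return n if n < 2 else fib(n - 1) + fib(n - 2)')

for i, line in enumerate(completion.splitlines()):
    ratio = difflib.SequenceMatcher(None, prompt_tail, line).ratio()
    print(i, round(ratio, 2), repr(line))
# line 1 scores 1.0 >= 0.95, so everything up to and including it is
# dropped; only the function body survives as the completion
```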
opencompass/openicl/icl_evaluator/code_evaluator.py
@@ -1,12 +1,12 @@
 # flake8: noqa: E501
-import difflib
 import os
 import re
 import tempfile
 import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+import numpy as np
 from datasets import Dataset
 from gradio_client import Client
 
@@ -24,9 +24,9 @@ class CodeEvaluator(BaseEvaluator):
     """
 
     def __init__(self,
-                 language: str,
+                 language: str = 'py',
                  ip_address: str = 'localhost',
-                 retry: int = 3) -> None:
+                 retry: int = 5) -> None:
         """Initialize the CodeEvaluator.
 
         Args:
@@ -71,6 +71,7 @@ class CodeEvaluator(BaseEvaluator):
             - output (dict/list/str): Evaluation results or error message
         """
         try:
+            import requests
             temp_file_path = None
             # Handle file path input
             if isinstance(input_data, str):
@@ -83,7 +84,15 @@ class CodeEvaluator(BaseEvaluator):
                 input_data = temp_file_path
 
             # Send to evaluation service
-            result = self.client.predict(input_data, api_name='/evaluate')
+            try:
+                result = self.client.predict(input_data, api_name='/evaluate')
+            except Exception as e:
+                # Catch timeout and other exceptions
+                if 'timed out' in str(e).lower() or 'timeout' in str(
+                        e).lower():
+                    return False, f'Request to code eval service timed out: {e}'
+                else:
+                    raise
 
             # Process the result
             if isinstance(result, (dict, list)):
@@ -107,63 +116,16 @@ class CodeEvaluator(BaseEvaluator):
         except:  # noqa: E722
             pass
 
-    def _remove_prefix(self,
-                       prompt: str,
-                       completion: str,
-                       threshold: float = 0.95) -> str:
-        """Determine the truncation point in the completion based on the last
-        line of the prompt, remove all content before that line in the
-        completion, and return the completion string after removing the prefix.
-        This is done to convert chatbot-style inference mode to completion
-        mode.
-
-        Args:
-            prompt (str): The prompt text.
-            completion (str): The completion text.
-            threshold (float): Line similarity threshold.
-
-        Returns:
-            str: The completion string after removing the prefix.
-        """
-        prompt_lines = prompt.splitlines()
-        completion_lines = completion.splitlines()
-
-        if not prompt_lines:
-            return completion
-
-        last_prompt_line = prompt_lines[-1]
-        cut_index = -1
-
-        for i, completion_line in enumerate(completion_lines):
-            similarity = difflib.SequenceMatcher(None, last_prompt_line,
-                                                 completion_line).ratio()
-            if similarity >= threshold:
-                cut_index = i
-                break
-
-        if cut_index != -1:
-            return '\n'.join(completion_lines[cut_index + 1:])
-        else:
-            return completion
-
-    def _process_completions(self, test_case: dict, completions: list) -> list:
-        """Process code completion list, which typically involves extracting
-        code, removing repetitive prefixes caused by chatbot mode, and other
-        steps to ensure the model-generated code can be compiled successfully.
-
-        Args:
-            test_case (dict): Dictionary containing test case information including:
-            completions (list): List of code completions generated by the model.
-
-        Returns:
-            list: Processed code completion list.
-        """
-        processed_completions = []
-        for comp in completions:
-            comp = self._extract_code(comp)
-            post_comp = self._remove_prefix(test_case['prompt'], comp)
-            processed_completions.append(post_comp)
-        return processed_completions
+    def _process_completions(self, completion: str) -> list:
+        """Process code completions to extract the relevant code.
+
+        Args:
+            completion (str): Code completion string.
+
+        Returns:
+            list: List of processed code completions.
+        """
+        post_comp = self._extract_code(completion)
+        return post_comp
 
     def _evaluate(
             self, input_data: Union[Dict, List]
@@ -186,7 +148,7 @@ class CodeEvaluator(BaseEvaluator):
             succeed, output = self._code_eval_service(input_data)
             if not succeed:
                 num_retry += 1
-                time.sleep(10)
+                time.sleep(30)
             else:
                 break
 
@@ -195,6 +157,31 @@ class CodeEvaluator(BaseEvaluator):
 
         return True, output, None
 
+    def _process_results(self, outputs: List, prompts: List,
+                         total_count: int) -> Dict:
+        """Process the evaluation results.
+
+        Args:
+            outputs (list): List of evaluation results for each test case.
+            prompts (list): List of prompts used for each test case.
+            total_count (int): Total number of test cases.
+
+        Returns:
+            dict: Processed results including:
+                - pass@1: Percentage of test cases passed
+                - details: Detailed results for each test case
+        """
+        details = []
+        correct = 0
+        for output, prompt in zip(outputs, prompts):
+            output['prompt'] = prompt
+            if output.get('status') == 'OK':
+                output['correct'] = True
+                correct += 1
+            else:
+                output['correct'] = False
+            details.append(output)
+
+        return {f'pass@1': 100 * correct / total_count, 'details': details}
+
     def score(self, predictions: List, references: List,
               test_set: Dataset) -> Dict:
         """Score code generation predictions against references.
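
To see the shape `_process_results` produces, here is a toy run with two fabricated service outputs (the `status` key is what the evaluation service returns, per the code above):

```python
# toy illustration of the pass@1 aggregation in _process_results
outputs = [{'status': 'OK'}, {'status': 'Failed: assertion error'}]
prompts = ['prompt-0', 'prompt-1']

details, correct = [], 0
for output, prompt in zip(outputs, prompts):
    output['prompt'] = prompt
    output['correct'] = output.get('status') == 'OK'
    correct += output['correct']
    details.append(output)

print({'pass@1': 100 * correct / len(outputs)})  # {'pass@1': 50.0}
```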
@@ -221,28 +208,25 @@ class CodeEvaluator(BaseEvaluator):
         test_set = test_set.to_pandas()
         # Use the first column as the unique identifier
         test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
-        num_repeats = int(len(test_set) / len(test_set_origin))
 
         # 1. Prepare data for all test cases
-        all_test_cases = []
+        all_test_cases, prompts = [], []
         for i in range(len(test_set_origin)):
             test_case = test_set_origin.iloc[i]
-            completions = predictions[i * num_repeats:(i + 1) * num_repeats]
+            completion = predictions[i]
 
             # Process code completions
-            processed_completions = self._process_completions(
-                test_case, completions)
-            result_dict = {
+            processed_completion = self._process_completions(
+                test_case, completion)
+            code = test_case[
+                'prompt'] + processed_completion + '\n' + test_case['tests']
+            sub_data_dict = {
                 'name': test_case['name'],
                 'language': test_case['language'],
-                'prompt': test_case['prompt'],
-                'tests': test_case['tests'],
-                'processed_completions': processed_completions,
-                'completions': completions
+                'code': code
             }
-            all_test_cases.append(result_dict)
+            all_test_cases.append(sub_data_dict)
+            prompts.append(test_case['prompt'])
 
         # 2. Send all test cases to the evaluation service
         success, outputs, error_message = self._evaluate(all_test_cases)
@@ -250,18 +234,4 @@ class CodeEvaluator(BaseEvaluator):
             return {'error': error_message}
 
         # 3. Process the returned results
-        details = []
-        correct = 0
-        for output in outputs:
-            if output.get('status') == 'OK':
-                output['correct'] = True
-                correct += 1
-            else:
-                output['correct'] = False
-            details.append(output)
-
-        return {
-            f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
-            'details': details
-        }
+        return self._process_results(outputs, prompts, len(test_set_origin))
opencompass/utils/datasets_info.py
@@ -451,7 +451,16 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/nejmaibench/NEJM_All_Questions_And_Answers.csv",
     },
+    "opencompass/humaneval_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/humaneval_pro/humaneval_pro.json",
+    },
+    "opencompass/mbpp_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/mbpp_pro/mbpp_pro.json",
+    },
 }
 
 DATASETS_URL = {
@@ -808,6 +817,13 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nejmaibench.zip",
         "md5": "e6082cae3596b3ebea73e23ba445b99e"
-    }
+    },
+    "humaneval_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
+        "md5": "4c6fe556e84e905e4f0902d699e46de5",
+    },
+    "mbpp_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
+        "md5": "eac330b8a0a8687f006265c9383503ce",
+    },
 }
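
With both tables updated, `get_data_path('opencompass/humaneval_pro')` resolves to the local JSON file, and the `DATASETS_URL` entry gives the downloader the zip location plus its md5 checksum. A usage sketch (behavior inferred from how the loaders above call it):

```python
# sketch: resolve the registered dataset id to a local path
from opencompass.utils import get_data_path

local = get_data_path('opencompass/humaneval_pro', local_mode=True)
print(local)  # ./data/humaneval_pro/humaneval_pro.json
```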