OpenCompass/opencompass/datasets/mbpp_pro.py
Dongsheng Zhu 2c79dc5227
[Dataset] Add human_eval/mbpp pro (#2092)
* add bench

* update

* bug fix

* time update

* add index

* fix repeat bug
2025-05-12 18:38:13 +08:00

82 lines
2.8 KiB
Python

# flake8: noqa: E501
import json
from typing import Dict, List
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
# Prompt template with two ``str.format`` placeholders, ``{raw_problem}`` and
# ``{new_problem}``. MBPPProEvaluator.score fills it per test case so the
# rendered prompt can be passed along with the evaluation outputs.
# The text is runtime data: do not reflow or reword it.
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
class MBPPProDataset(BaseDataset):
    """Loader for the MBPP Pro benchmark stored as a JSON Lines file."""

    @staticmethod
    def load(path, local_mode=False):
        """Read a JSON Lines file into a ``Dataset``.

        Args:
            path: Dataset location; resolved through ``get_data_path``.
            local_mode: Forwarded unchanged to ``get_data_path``.

        Returns:
            A ``Dataset`` built with ``Dataset.from_list`` containing one
            record per non-blank line of the file.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        path = get_data_path(path, local_mode=local_mode)
        with open(path, encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Blank lines (e.g. a trailing empty line at end of file) carry
            # no record and would make json.loads raise, so skip them.
            records = [json.loads(line) for line in stripped_lines if line]
        return Dataset.from_list(records)
class MBPPProEvaluator(CodeEvaluator):
    """Evaluator that pairs MBPP Pro completions with their test code."""

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        """Build one test case per unique problem and evaluate them all.

        Args:
            predictions: Model completions, looked up by position.
            references: Reference entries; only their count is checked here.
            test_set: Dataset providing the ``id``, ``test_code``,
                ``raw_problem`` and ``new_problem`` columns.

        Returns:
            ``{'error': ...}`` when the two input lengths differ or when
            ``self._evaluate`` reports failure; otherwise the value returned
            by ``self._process_results``.
        """
        n_predictions, n_references = len(predictions), len(references)
        if n_predictions != n_references:
            message = ('predictions and references have different '
                       f'length. len(predictions): {n_predictions}, '
                       f'len(references): {n_references}')
            return {'error': message}

        frame = test_set.to_pandas()
        # The first column acts as the unique identifier: rows repeating a
        # value already seen in it are dropped.
        unique_rows = frame.drop_duplicates(subset=frame.columns[0])
        total = len(unique_rows)

        # Step 1: assemble the payload and the rendered prompt for each row.
        payloads = []
        rendered_prompts = []
        for position in range(total):
            row = unique_rows.iloc[position]
            # NOTE(review): predictions are indexed by position within the
            # de-duplicated frame; presumably the caller guarantees that
            # ordering matches -- confirm when the test set has repeats.
            cleaned = self._process_completions(predictions[position])
            # Append the problem's test code so the completion is executed
            # together with its checks.
            program = cleaned + '\n' + row['test_code']
            payloads.append({
                'name': int(row['id']),
                'language': self.language,
                'code': program,
            })
            rendered_prompts.append(
                PROMPT_WRAPPER.format(raw_problem=row['raw_problem'],
                                      new_problem=row['new_problem']))

        # Step 2: submit every payload to the evaluation service at once.
        success, outputs, error_message = self._evaluate(payloads)

        # Step 3: turn the raw outputs into the final result dictionary.
        if success:
            return self._process_results(outputs, rendered_prompts, total)
        return {'error': error_message}