import json
from typing import Dict, List

import numpy as np
from datasets import Dataset

from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset


class MBPPProDataset(BaseDataset):
    """MBPP Pro dataset loaded from a JSON Lines file."""

    @staticmethod
    def load(path, num_repeats=1, local_mode=False):
        """Load the dataset, repeating every problem ``num_repeats`` times so
        that multiple completions per problem can be scored (e.g. for
        pass@k)."""
        path = get_data_path(path, local_mode=local_mode)
        print(path)
        dataset = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                dataset.extend(
                    [json.loads(line.strip()) for _ in range(num_repeats)])
        return Dataset.from_list(dataset)
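

# A usage sketch (the path below is a placeholder, not a shipped data file):
# ``load`` expects a JSON Lines file with one problem per line, and
# ``num_repeats`` duplicates each problem so that several completions can be
# sampled and scored per problem.
#
#     dataset = MBPPProDataset.load('/path/to/mbpp_pro.jsonl', num_repeats=4)
#     # len(dataset) == num_repeats * number of problems in the file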


class MBPPProEvaluator(CodeEvaluator):
    """Evaluator that runs MBPP Pro completions against their test code via
    the code evaluation service and reports pass@k."""

    def _process_completions(self, test_case: dict, completions: list) -> list:
        # Reduce each raw model completion to the extracted code snippet.
        processed_completions = []
        for comp in completions:
            post_comp = self._extract_code(comp)
            processed_completions.append(post_comp)
        return processed_completions
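
    # Note: ``_extract_code`` is inherited from CodeEvaluator. The assumption
    # here is that it recovers the code portion of a raw completion (e.g. the
    # contents of a fenced code block), so only executable code is sent to the
    # evaluation service.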

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        test_set = test_set.to_pandas()
        # Use the first column as the unique identifier
        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
        num_repeats = int(len(test_set) / len(test_set_origin))

        # 1. Prepare data for all test cases
        all_test_cases = []
        for i in range(len(test_set_origin)):
            test_case = test_set_origin.iloc[i]
            completions = predictions[i * num_repeats:(i + 1) * num_repeats]

            # Process code completions
            processed_completions = self._process_completions(
                test_case, completions)

            sub_data_dict = {
                'name': int(test_case['id']),
                'language': self.language,
                'prompt': '',
                'tests': test_case['test_code'],
                'processed_completions': processed_completions,
                'completions': completions
            }

            all_test_cases.append(sub_data_dict)

        # 2. Send all test cases to the evaluation service
        success, outputs, error_message = self._evaluate(all_test_cases)
        if not success:
            return {'error': error_message}
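
        # ``outputs`` is assumed, based on how it is consumed below, to contain
        # one entry per test case, each carrying a ``meta_data`` list with one
        # per-completion result whose ``status`` is 'OK' when that completion
        # passed the tests.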

        # 3. Process the returned results
        details = []
        total, correct = [], []
        for output in outputs:
            passed = [m['status'] == 'OK' for m in output['meta_data']]
            total.append(len(passed))
            correct.append(sum(passed))
            details.append(output)
        total = np.array(total)
        correct = np.array(correct)

        pass_at_k = {
            f'pass@{k}':
            self.estimate_pass_at_k(total, correct, k).mean() * 100
            for k in self.k if (total >= k).all()
        }

        return {
            **pass_at_k,
            'details': details,
        }
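

# For reference, a minimal sketch of the unbiased pass@k estimator that
# ``estimate_pass_at_k`` (inherited from CodeEvaluator) is assumed to follow
# (Chen et al., 2021): per problem, pass@k = 1 - C(n - c, k) / C(n, k), where
# n completions were sampled and c of them passed. The helper name below is
# illustrative and the function is not used by the evaluator itself.
def _pass_at_k_sketch(num_samples, num_correct, k: int) -> np.ndarray:

    def estimator(n: int, c: int) -> float:
        if n - c < k:
            # Fewer than k failing completions: any draw of k must include a
            # passing one.
            return 1.0
        # 1 - C(n - c, k) / C(n, k), expanded as a numerically stable product.
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    return np.array([
        estimator(int(n), int(c)) for n, c in zip(num_samples, num_correct)
    ])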