# OpenCompass/opencompass/datasets/humaneval_pro.py

# flake8: noqa: E501

import json
from typing import Dict, List

from datasets import Dataset

from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset

# Prompt template with two placeholders, ``{raw_problem}`` and
# ``{new_problem}``, filled in via ``str.format`` by
# ``HumanevalProEvaluator.score`` for every test case. The text asks for both
# solutions to be returned inside a single Python code block.
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""


class HumanevalevalProDataset(BaseDataset):
    """Dataset loader that turns a JSON file into a ``Dataset``."""

    @staticmethod
    def load(path, local_mode=False):
        """Read the records stored in a JSON file.

        Args:
            path: Dataset location; resolved through ``get_data_path``.
            local_mode: Forwarded unchanged to ``get_data_path``.

        Returns:
            Dataset: Built with ``Dataset.from_list`` from the items obtained
            by iterating over the decoded JSON document.
        """
        resolved_path = get_data_path(path, local_mode=local_mode)
        with open(resolved_path, encoding='utf-8') as json_file:
            # Materialise the decoded document item by item, in file order.
            records = list(json.load(json_file))
        return Dataset.from_list(records)


class HumanevalProEvaluator(CodeEvaluator):
    """Evaluator that pairs each prediction with its test code and scores it."""

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        """Assemble one program per test case and evaluate the whole batch.

        Args:
            predictions: Model completions, looked up by the position of the
                corresponding row in the de-duplicated test set.
            references: Only its length is used, to check it against
                ``predictions``.
            test_set: Dataset whose rows provide the ``id``, ``test_code``,
                ``raw_problem`` and ``new_problem`` fields.

        Returns:
            Dict: ``{'error': ...}`` when the lengths differ or when
            ``self._evaluate`` reports failure; otherwise the value returned
            by ``self._process_results``.
        """
        n_preds, n_refs = len(predictions), len(references)
        if n_preds != n_refs:
            mismatch = ('predictions and references have different '
                        f'length. len(predictions): {n_preds}, '
                        f'len(references): {n_refs}')
            return {'error': mismatch}

        frame = test_set.to_pandas()
        # Rows sharing a value in the first column are collapsed to the first
        # occurrence, so each identifier is evaluated once.
        unique_cases = frame.drop_duplicates(subset=frame.columns[0])
        n_cases = len(unique_cases)

        # Step 1: build the payload and the prompt for every test case.
        payloads, prompts = [], []
        for idx in range(n_cases):
            row = unique_cases.iloc[idx]
            raw_completion = predictions[idx]

            # The cleaned completion is followed by the case's test code.
            cleaned = self._process_completions(raw_completion)
            program = cleaned + '\n' + row['test_code']
            payloads.append({
                'name': int(row['id']),
                'language': self.language,
                'code': program,
            })

            prompts.append(
                PROMPT_WRAPPER.format(raw_problem=row['raw_problem'],
                                      new_problem=row['new_problem']))

        # Step 2: hand the whole batch to the evaluation backend.
        ok, outputs, failure_reason = self._evaluate(payloads)

        # Step 3: turn the backend outputs into the final result dict.
        if ok:
            return self._process_results(outputs, prompts, n_cases)
        return {'error': failure_reason}
update 2025-05-10 19:35:53 +08:00			`# flake8: noqa: E501s`

add bench 2025-05-09 10:36:39 +08:00			`import json`
			`from typing import Dict, List`

			`from datasets import Dataset`

			`from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator`
			`from opencompass.utils import get_data_path`

			`from .base import BaseDataset`

update 2025-05-10 19:35:53 +08:00			`PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.`
			`Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.`
			```python
			`{raw_problem}`
			`{new_problem}`
			```
			`Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:`
			```python
			```
			`"""`

add bench 2025-05-09 10:36:39 +08:00
			`class HumanevalevalProDataset(BaseDataset):`

			`@staticmethod`
update 2025-05-10 19:35:53 +08:00			`def load(path, local_mode=False):`
add bench 2025-05-09 10:36:39 +08:00			`path = get_data_path(path, local_mode=local_mode)`
			`dataset = []`
			`with open(path, encoding='utf-8') as f:`
			`raw_data = json.load(f)`
			`for data in raw_data:`
update 2025-05-10 19:35:53 +08:00			`dataset.append(data)`
add bench 2025-05-09 10:36:39 +08:00			`return Dataset.from_list(dataset)`


			`class HumanevalProEvaluator(CodeEvaluator):`

			`def score(self, predictions: List, references: List,`
			`test_set: Dataset) -> Dict:`
			`if len(predictions) != len(references):`
			`return {`
			`'error':`
			`'predictions and references have different '`
			`f'length. len(predictions): {len(predictions)}, '`
			`f'len(references): {len(references)}'`
			`}`

			`test_set = test_set.to_pandas()`
			`# Use the first column as the unique identifier`
			`test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])`

			`# 1. Prepare data for all test cases`
update 2025-05-10 19:35:53 +08:00			`all_test_cases, prompts = [], []`
add bench 2025-05-09 10:36:39 +08:00			`for i in range(len(test_set_origin)):`
			`test_case = test_set_origin.iloc[i]`
update 2025-05-10 19:35:53 +08:00			`completion = predictions[i]`
add bench 2025-05-09 10:36:39 +08:00
			`# Process code completions`
update 2025-05-10 19:35:53 +08:00			`processed_completion = self._process_completions(completion)`
			`code = processed_completion + '\n' + test_case['test_code']`
add bench 2025-05-09 10:36:39 +08:00			`sub_data_dict = {`
			`'name': int(test_case['id']),`
			`'language': self.language,`
update 2025-05-10 19:35:53 +08:00			`'code': code,`
add bench 2025-05-09 10:36:39 +08:00			`}`
			`all_test_cases.append(sub_data_dict)`

update 2025-05-10 19:35:53 +08:00			`prompt = PROMPT_WRAPPER.format(`
			`raw_problem=test_case['raw_problem'],`
			`new_problem=test_case['new_problem'])`
			`prompts.append(prompt)`

add bench 2025-05-09 10:36:39 +08:00			`# 2. Send all test cases to the evaluation service`
			`success, outputs, error_message = self._evaluate(all_test_cases)`
			`if not success:`
			`return {'error': error_message}`

			`# 3. Process the returned results`
update 2025-05-10 19:35:53 +08:00			`return self._process_results(outputs, prompts, len(test_set_origin))`