2025-05-10 19:35:53 +08:00
# flake8: noqa: E501
2025-05-09 10:36:39 +08:00
import json
from typing import Dict , List
from datasets import Dataset
from opencompass . openicl . icl_evaluator . code_evaluator import CodeEvaluator
from opencompass . utils import get_data_path
from . base import BaseDataset
2025-05-10 19:35:53 +08:00
# Prompt template filled through ``str.format`` with the ``raw_problem`` and
# ``new_problem`` keyword arguments. The placeholders must not contain
# padding spaces: ``{ raw_problem }`` would make ``format`` look up the key
# ' raw_problem ' and raise ``KeyError``.
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
2025-05-09 10:36:39 +08:00
class HumanevalevalProDataset(BaseDataset):
    """Dataset loader that turns a JSON file into a ``datasets.Dataset``."""

    @staticmethod
    def load(path, local_mode=False):
        """Load the records stored in a JSON file.

        Args:
            path: Dataset location; resolved through ``get_data_path``.
            local_mode: Forwarded unchanged to ``get_data_path``.

        Returns:
            A ``Dataset`` created by ``Dataset.from_list`` from the items
            obtained by iterating the decoded JSON document, in order.
        """
        path = get_data_path(path, local_mode=local_mode)
        with open(path, encoding='utf-8') as f:
            raw_data = json.load(f)
        # ``list(...)`` reproduces the original item-by-item copy exactly.
        return Dataset.from_list(list(raw_data))
class HumanevalProEvaluator(CodeEvaluator):
    """Evaluator that submits each completion together with its test code."""

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        """Build one runnable program per test case and evaluate them all.

        Args:
            predictions: Model completions, indexed by test-case position.
            references: Reference entries; only their count is checked here.
            test_set: Dataset providing the ``id``, ``test_code``,
                ``raw_problem`` and ``new_problem`` columns read below.

        Returns:
            ``{'error': message}`` when the prediction and reference counts
            differ or when ``self._evaluate`` reports failure; otherwise
            whatever ``self._process_results`` returns.
        """
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        test_set = test_set.to_pandas()
        # Use the first column as the unique identifier
        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
        num_cases = len(test_set_origin)

        # 1. Prepare data for all test cases
        all_test_cases, prompts = [], []
        for i in range(num_cases):
            test_case = test_set_origin.iloc[i]
            # NOTE(review): ``predictions`` is indexed by position in the
            # de-duplicated frame. If ``test_set`` ever contains repeated
            # identifiers, predictions and test cases would no longer line
            # up -- confirm that callers never pass repeated samples.
            completion = predictions[i]

            # Process code completions
            processed_completion = self._process_completions(completion)
            # The test code is appended after the completion so both are
            # submitted as a single program.
            code = processed_completion + '\n' + test_case['test_code']

            all_test_cases.append({
                'name': int(test_case['id']),
                'language': self.language,
                'code': code,
            })

            prompts.append(
                PROMPT_WRAPPER.format(
                    raw_problem=test_case['raw_problem'],
                    new_problem=test_case['new_problem']))

        # 2. Send all test cases to the evaluation service
        success, outputs, error_message = self._evaluate(all_test_cases)
        if not success:
            return {'error': error_message}

        # 3. Process the returned results
        return self._process_results(outputs, prompts, num_cases)