# flake8: noqa: E501
import os
import re
import tempfile
import time
from typing import Any, Dict, List, Optional, Tuple, Union

from datasets import Dataset
from gradio_client import Client

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS


@ICL_EVALUATORS.register_module()
class CodeEvaluator(BaseEvaluator):
    """Evaluator for code generation tasks.

    This evaluator sends code to a remote evaluation service to test its
    functionality against provided test cases. It handles code extraction,
    processing, and result analysis.
    """

    def __init__(self,
                 language: str = 'py',
                 ip_address: str = 'localhost',
                 retry: int = 5) -> None:
        """Initialize the CodeEvaluator.

        Args:
            language (str): Programming language of the code to evaluate.
            ip_address (str, optional): IP address of the evaluation service.
                Defaults to 'localhost'.
            retry (int, optional): Number of retry attempts for failed
                connections. Defaults to 5.
        """
        self.language = language
        self.retry = retry
        self.client = Client(ip_address)
        super().__init__()

    def _extract_code(self, text: str) -> str:
        """Extract code from markdown-formatted text.

        Args:
            text (str): Text that may contain code blocks in markdown format.

        Returns:
            str: Extracted code from the first code block, or the original
                text if no code blocks are found.
        """
        blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
        if len(blocks) >= 1:
            text = blocks[0]
        return text

    def _code_eval_service(
        self, input_data: Union[Dict, List, str]
    ) -> Tuple[bool, Union[Dict, List, Any]]:
        """Send code to the remote evaluation service using gradio_client and
        get the results.

        Args:
            input_data: Can be one of:
                - dict: Dictionary containing code information for a single
                    test case
                - list: List of dictionaries for batch evaluation
                - str: File path to code file

        Returns:
            tuple: (succeed, output)
                - succeed (bool): Whether the request was successful
                - output (dict/list/str): Evaluation results or error message
        """
        try:
            temp_file_path = None
            # Handle file path input
            if isinstance(input_data, str):
                with tempfile.NamedTemporaryFile(suffix=f'.{self.language}',
                                                 delete=False) as temp_file:
                    temp_file_path = temp_file.name
                    with open(input_data, 'r') as src_file:
                        content = src_file.read()
                    temp_file.write(content.encode())
                input_data = temp_file_path

            # Send to evaluation service
            try:
                result = self.client.predict(input_data,
                                             api_name='/evaluate')
            except Exception as e:
                # Catch timeout and other exceptions
                if 'timed out' in str(e).lower() or 'timeout' in str(
                        e).lower():
                    return False, f'Request to code eval service timed out: {e}'
                else:
                    raise

            # Process the result
            if isinstance(result, (dict, list)):
                return True, result
            else:
                # Try to parse the result as JSON if it's a string
                try:
                    import json
                    parsed_result = json.loads(result)
                    return True, parsed_result
                except:  # noqa: E722
                    return True, {'status': 'unknown', 'raw_result': result}

        except Exception as e:
            return False, str(e)
        finally:
            # Clean up temporary file if it was created
            if temp_file_path and os.path.exists(temp_file_path):
                try:
                    os.unlink(temp_file_path)
                except:  # noqa: E722
                    pass

    def _process_completions(self, completion: str) -> str:
        """Process a code completion to extract the relevant code.

        Args:
            completion (str): Code completion string.

        Returns:
            str: Processed code completion.
""" post_comp = self._extract_code(completion) return post_comp def _evaluate( self, input_data: Union[Dict, List] ) -> Tuple[bool, Optional[Union[Dict, List]], Optional[str]]: """Evaluate code with retry mechanism. Args: input_data: Can be either: - dict: Dictionary containing code and test information for a single test case - list: List of dictionaries for batch evaluation Returns: tuple: (success, output, error_message) - success (bool): Whether the evaluation was successful - output (dict or list): Evaluation output (if successful) - error_message (str): Error message (if failed) """ num_retry = 0 while num_retry < self.retry: succeed, output = self._code_eval_service(input_data) if not succeed: num_retry += 1 time.sleep(30) else: break if not succeed: return False, None, f'code eval service connection failed: {output}' return True, output, None def _process_results(self, outputs: List, prompts: List, total_count: int) -> Dict: """Process the evaluation results. Args: outputs (list): List of evaluation results for each test case. prompts (list): List of prompts used for each test case. total_count (int): Total number of test cases. Returns: dict: Processed results including: - pass@1: Percentage of test cases passed - details: Detailed results for each test case """ details = [] correct = 0 for output, prompt in zip(outputs, prompts): output['prompt'] = prompt if output.get('status') == 'OK': output['correct'] = True correct += 1 else: output['correct'] = False details.append(output) return {f'pass@1': 100 * correct / total_count, 'details': details} def score(self, predictions: List, references: List, test_set: Dataset) -> Dict: """Score code generation predictions against references. Args: predictions (list): List of model-generated code completions. references (list): List of reference solutions (not directly used in evaluation). test_set (Dataset): Dataset containing test cases and other metadata. Returns: dict: Evaluation results including: - accuracy: Percentage of correctly solved problems - details: Detailed results for each test case - error: Error message if evaluation failed """ if len(predictions) != len(references): return { 'error': 'predictions and references have different ' f'length. len(predictions): {len(predictions)}, ' f'len(references): {len(references)}' } test_set = test_set.to_pandas() # Use the first column as the unique identifier test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0]) # 1. Prepare data for all test cases all_test_cases, prompts = [], [] for i in range(len(test_set_origin)): test_case = test_set_origin.iloc[i] completion = predictions[i] # Process code completions processed_completion = self._process_completions( test_case, completion) code = test_case[ 'prompt'] + processed_completion + '\n' + test_case['tests'] sub_data_dict = { 'name': test_case['name'], 'language': test_case['language'], 'code': code } all_test_cases.append(sub_data_dict) prompts.append(test_case['prompt']) # 2. Send all test cases to the evaluation service success, outputs, error_message = self._evaluate(all_test_cases) if not success: return {'error': error_message} # 3. Process the returned results return self._process_results(outputs, prompts, len(test_set_origin))