Mirror of https://github.com/open-compass/opencompass.git

commit 373a0cba9b (parent b9de8b0e2b), branch develop: multiple_code
@@ -0,0 +1,56 @@
# Select the 10 most popular programming languages from MultiPL-E to compose the test set.

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MultiplEDataset, MultiplEEvaluator

_TOP_TEN_LANGUAGE_ = ['cpp', 'cs', 'go', 'java', 'rb', 'js', 'php', 'r', 'rs', 'sh']

multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests')

multiple_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048),
)

multiple_eval_cfg = {
    lang: dict(
        evaluator=dict(
            type=MultiplEEvaluator,
            language=lang,
            ip_address='https://dongsheng-docker-test.hf.space',  # https://opencompass-multiple-evaluator.hf.space
        ),
        pred_role='BOT',
    ) for lang in _TOP_TEN_LANGUAGE_
}

multiple_datasets = [
    dict(
        type=MultiplEDataset,
        abbr=f'humaneval-multiple-{lang}',
        language=lang,
        num_repeats=1,
        path='opencompass/multipl_e',
        tag='humaneval',
        reader_cfg=multiple_reader_cfg,
        infer_cfg=multiple_infer_cfg,
        eval_cfg=multiple_eval_cfg[lang],
    ) for lang in _TOP_TEN_LANGUAGE_
]

multiple_datasets += [
    dict(
        type=MultiplEDataset,
        abbr=f'mbpp-multiple-{lang}',
        language=lang,
        num_repeats=1,
        path='opencompass/multipl_e',
        tag='mbpp',
        reader_cfg=multiple_reader_cfg,
        infer_cfg=multiple_infer_cfg,
        eval_cfg=multiple_eval_cfg[lang],
    ) for lang in _TOP_TEN_LANGUAGE_
]
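For a concrete sense of what the model is asked, here is the template above rendered for one invented Ruby item (the prompt value is a placeholder, not real MultiPL-E data):

template = ('Based on the provided {language} code snippet, complete the '
            'subsequent content. The initial part of the completed code must '
            'match the provided code snippet exactly:\n{prompt}')
print(template.format(language='rb', prompt='def add(a, b)\n'))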
opencompass/configs/models/phi/hf_phi_4.py (new file)
@@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='phi-4',
        path='microsoft/phi-4',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=2),
    )
]
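A sketch of a top-level config tying this model file to the MultiPL-E datasets above; the dataset module path is an assumption, since the commit does not show where that config file is saved:

from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.phi.hf_phi_4 import models
    # hypothetical module path for the MultiPL-E dataset config in this commit
    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import \
        multiple_datasets as datasets

# then launched as usual, e.g.: python run.py <this_config>.py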
@@ -98,6 +98,7 @@ from .mmlu_cf import *  # noqa: F401, F403
from .mmlu_pro import *  # noqa: F401, F403
from .MMLUArabic import *  # noqa: F401, F403
from .mmmlu import *  # noqa: F401, F403
from .multipl_e import *  # noqa: F401, F403
from .multirc import *  # noqa: F401, F403
from .musr import *  # noqa: F401, F403
from .narrativeqa import *  # noqa: F401, F403
@@ -183,6 +183,33 @@ class CustomDataset(BaseDataset):
        return Dataset.from_list(data)


@LOAD_DATASET.register_module()
class CodeCustomDataset(BaseDataset):

    @staticmethod
    def load(path, file_name=None, local_mode=False, num_repeats=1, **kwargs):
        path = get_data_path(path, local_mode=local_mode)
        if file_name is not None:
            path = os.path.join(path, file_name)
        data = []
        if path.endswith('.jsonl'):
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    data.extend(
                        [json.loads(line.strip()) for _ in range(num_repeats)])
        elif path.endswith('.csv'):
            with open(path, 'r', encoding='utf-8-sig') as f:
                reader = csv.reader(f)
                header = next(reader)
                for row in reader:
                    data.extend(
                        [dict(zip(header, row)) for _ in range(num_repeats)])
        else:
            raise ValueError(f'Unsupported file format: {path}')

        return Dataset.from_list(data)


class CircularCustomDataset(CustomDataset, metaclass=CircularDatasetMeta):
    dataset_class = CustomDataset
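The num_repeats logic is worth seeing in isolation: each row is duplicated in place, so repeats of the same problem stay contiguous, a layout the pass@k scoring further down relies on. A minimal, self-contained illustration with invented rows:

import json

num_repeats = 3
lines = ['{"name": "t1"}', '{"name": "t2"}']
data = []
for line in lines:
    # same duplication pattern as CodeCustomDataset.load above
    data.extend([json.loads(line.strip()) for _ in range(num_repeats)])
print([d['name'] for d in data])
# -> ['t1', 't1', 't1', 't2', 't2', 't2']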
opencompass/datasets/multipl_e.py (new file)
@@ -0,0 +1,91 @@
import json
import os.path as osp

from datasets import Dataset

from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset

# currently supported languages
_HUMANEVAL_LANGUAGE_ = [
    'adb', 'clj', 'cpp', 'cs', 'd', 'dart', 'elixir', 'go', 'hs', 'java', 'jl',
    'js', 'lua', 'ml', 'php', 'pl', 'py', 'r', 'rb', 'rkt', 'rs', 'scala',
    'sh', 'swift', 'ts'
]
_MBPP_LANGUAGE_ = [
    'adb', 'clj', 'cpp', 'cs', 'd', 'elixir', 'go', 'hs', 'java', 'jl', 'js',
    'lua', 'ml', 'php', 'pl', 'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh',
    'swift', 'ts'
]


@LOAD_DATASET.register_module()
class MultiplEDataset(BaseDataset):

    @staticmethod
    def load(path: str,
             language: str,
             num_repeats: int = 1,
             tag: str = 'humaneval',
             local_mode: bool = False):
        """Load a MultiPL-E dataset (HumanEval or MBPP translations) for
        pass@k evaluation.

        Note that you can use num_repeats > 1 when your model does not support
        `num_return_sequences` in generation; otherwise use the raw dataset
        and set `num_return_sequences` in the model config to generate
        multiple responses for testing pass@k > 1.

        It is better to change your dataset abbr accordingly if you set
        num_repeats > 1, otherwise the number in `.cache/dataset_size.json`
        might be inconsistent.

        Args:
            num_repeats (int): Number of repetitions of this dataset, used to
                obtain multiple responses in special cases.
        """
        path = get_data_path(path, local_mode=local_mode)
        assert tag in ['humaneval',
                       'mbpp'], 'tag must be in ["humaneval", "mbpp"]'
        if tag == 'humaneval':
            assert language in _HUMANEVAL_LANGUAGE_, (
                f'language must be in {_HUMANEVAL_LANGUAGE_}')
        else:
            assert language in _MBPP_LANGUAGE_, (
                f'language must be in {_MBPP_LANGUAGE_}')
        file_path = osp.join(path, f'{tag}-{language}.jsonl')
        dataset = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # duplicate each problem num_repeats times, keeping repeats
                # contiguous so the evaluator can regroup them later
                dataset.extend(
                    [json.loads(line.strip()) for _ in range(num_repeats)])
        return Dataset.from_list(dataset)


class MultiplEEvaluator(CodeEvaluator):

    def _stop_at_stop_token(self, decoded_string, stop_tokens):
        """Produce the prefix of decoded_string that ends at the first
        occurrence of any stop token.

        WARNING: the decoded_string *must not* include the prompt, which may
        itself contain stop tokens.
        """
        min_stop_index = len(decoded_string)
        for stop_token in stop_tokens:
            stop_index = decoded_string.find(stop_token)
            if stop_index != -1 and stop_index < min_stop_index:
                min_stop_index = stop_index
        return decoded_string[:min_stop_index]

    def _process_completions(self, test_case, completions):
        processed_completions = []
        for comp in completions:
            comp = self._extract_code(comp)
            post_comp = self._remove_prefix(test_case['prompt'], comp)
            post_comp = self._stop_at_stop_token(post_comp,
                                                 test_case['stop_tokens'])
            processed_completions.append(post_comp)
        return processed_completions
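Since `_stop_at_stop_token` does the heavy lifting of MultiPL-E post-processing, here is the same logic exercised standalone on invented inputs; it also shows why the docstring insists the input must not include the prompt, whose own `def` line would match a stop token:

def stop_at_stop_token(decoded_string, stop_tokens):
    # identical logic to MultiplEEvaluator._stop_at_stop_token above
    min_stop_index = len(decoded_string)
    for stop_token in stop_tokens:
        stop_index = decoded_string.find(stop_token)
        if stop_index != -1 and stop_index < min_stop_index:
            min_stop_index = stop_index
    return decoded_string[:min_stop_index]

body = '    return a + b\nend\n\ndef unrelated()\n'
print(stop_at_stop_token(body, ['\ndef ', '\nclass ']))
# prints '    return a + b\nend\n' -- cut at the earliest stop token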
opencompass/openicl/icl_evaluator/code_evaluator.py (new file)
@@ -0,0 +1,267 @@
# flake8: noqa: E501

import difflib
import os
import re
import tempfile
import time
from typing import Any, Dict, List, Optional, Tuple, Union

from datasets import Dataset
from gradio_client import Client

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS


@ICL_EVALUATORS.register_module()
class CodeEvaluator(BaseEvaluator):
    """Evaluator for code generation tasks.

    This evaluator sends code to a remote evaluation service to test its
    functionality against the provided test cases. It handles code
    extraction, post-processing, and result analysis.
    """

    def __init__(self,
                 language: str,
                 ip_address: str = 'localhost',
                 retry: int = 3) -> None:
        """Initialize the CodeEvaluator.

        Args:
            language (str): Programming language of the code to evaluate.
            ip_address (str, optional): Address of the evaluation service.
                Defaults to 'localhost'.
            retry (int, optional): Number of retry attempts for failed
                connections. Defaults to 3.
        """
        self.language = language
        self.retry = retry
        self.client = Client(ip_address)
        super().__init__()

    def _extract_code(self, text: str) -> str:
        """Extract code from markdown-formatted text.

        Args:
            text (str): Text that may contain code blocks in markdown format.

        Returns:
            str: Code extracted from the first fenced code block, or the
                original text if no code block is found.
        """
        blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
        if len(blocks) >= 1:
            text = blocks[0]
        return text

    def _code_eval_service(
        self, input_data: Union[Dict, List, str]
    ) -> Tuple[bool, Union[Dict, List, Any]]:
        """Send code to the remote evaluation service via gradio_client and
        return the results.

        Args:
            input_data: Can be one of:
                - dict: code information for a single test case
                - list: list of dictionaries for batch evaluation
                - str: path to a code file

        Returns:
            tuple: (succeed, output)
                - succeed (bool): whether the request was successful
                - output (dict/list/str): evaluation results or error message
        """
        try:
            temp_file_path = None
            # Handle file path input: copy the file so it carries the
            # language-specific suffix the service expects
            if isinstance(input_data, str):
                with tempfile.NamedTemporaryFile(suffix=f'.{self.language}',
                                                 delete=False) as temp_file:
                    temp_file_path = temp_file.name
                    with open(input_data, 'r') as src_file:
                        content = src_file.read()
                    temp_file.write(content.encode())
                input_data = temp_file_path

            # Send to the evaluation service
            result = self.client.predict(input_data, api_name='/evaluate')

            # Process the result
            if isinstance(result, (dict, list)):
                return True, result
            else:
                # Try to parse the result as JSON if it is a string
                try:
                    import json
                    parsed_result = json.loads(result)
                    return True, parsed_result
                except:  # noqa: E722
                    return True, {'status': 'unknown', 'raw_result': result}

        except Exception as e:
            return False, str(e)
        finally:
            # Clean up the temporary file if one was created
            if temp_file_path and os.path.exists(temp_file_path):
                try:
                    os.unlink(temp_file_path)
                except:  # noqa: E722
                    pass

    def _remove_prefix(self,
                       prompt: str,
                       completion: str,
                       threshold: float = 0.95) -> str:
        """Determine the truncation point in the completion based on the last
        line of the prompt, remove everything up to and including that line,
        and return the remainder. This converts chatbot-style output, which
        tends to echo the prompt, back into completion-style output.

        Args:
            prompt (str): The prompt text.
            completion (str): The completion text.
            threshold (float): Line-similarity threshold.

        Returns:
            str: The completion string after removing the prefix.
        """
        prompt_lines = prompt.splitlines()
        completion_lines = completion.splitlines()

        if not prompt_lines:
            return completion

        last_prompt_line = prompt_lines[-1]
        cut_index = -1

        for i, completion_line in enumerate(completion_lines):
            similarity = difflib.SequenceMatcher(None, last_prompt_line,
                                                 completion_line).ratio()
            if similarity >= threshold:
                cut_index = i
                break

        if cut_index != -1:
            return '\n'.join(completion_lines[cut_index + 1:])
        else:
            return completion

    def _process_completions(self, test_case: dict, completions: list) -> list:
        """Post-process a list of code completions: extract the code, remove
        the prompt echo produced by chatbot-style inference, and apply any
        other steps needed so the generated code can be compiled.

        Args:
            test_case (dict): Test case information, including the original
                prompt.
            completions (list): Code completions generated by the model.

        Returns:
            list: Processed code completions.
        """
        processed_completions = []
        for comp in completions:
            comp = self._extract_code(comp)
            post_comp = self._remove_prefix(test_case['prompt'], comp)
            processed_completions.append(post_comp)
        return processed_completions

    def _evaluate(
        self, input_data: Union[Dict, List]
    ) -> Tuple[bool, Optional[Union[Dict, List]], Optional[str]]:
        """Evaluate code with a retry mechanism.

        Args:
            input_data: Can be either:
                - dict: code and test information for a single test case
                - list: list of dictionaries for batch evaluation

        Returns:
            tuple: (success, output, error_message)
                - success (bool): whether the evaluation succeeded
                - output (dict or list): evaluation output, if successful
                - error_message (str): error message, if failed
        """
        num_retry = 0
        while num_retry < self.retry:
            succeed, output = self._code_eval_service(input_data)
            if not succeed:
                num_retry += 1
                time.sleep(10)
            else:
                break

        if not succeed:
            return False, None, f'code eval service connection failed: {output}'

        return True, output, None

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        """Score code generation predictions against references.

        Args:
            predictions (list): Model-generated code completions.
            references (list): Reference solutions (not directly used in
                evaluation).
            test_set (Dataset): Dataset containing test cases and metadata.

        Returns:
            dict: Evaluation results including:
                - pass@{num_repeats}: percentage of problems solved
                - details: detailed results for each test case
                - error: error message, if evaluation failed
        """
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        test_set = test_set.to_pandas()
        # Use the first column as the unique identifier
        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
        num_repeats = int(len(test_set) / len(test_set_origin))

        # 1. Prepare data for all test cases; repeats of the same problem
        #    are contiguous in both test_set and predictions
        all_test_cases = []
        for i in range(len(test_set_origin)):
            test_case = test_set_origin.iloc[i]
            completions = predictions[i * num_repeats:(i + 1) * num_repeats]

            # Process code completions
            processed_completions = self._process_completions(
                test_case, completions)

            result_dict = {
                'name': test_case['name'],
                'language': test_case['language'],
                'prompt': test_case['prompt'],
                'tests': test_case['tests'],
                'processed_completions': processed_completions,
                'completions': completions
            }

            all_test_cases.append(result_dict)

        # 2. Send all test cases to the evaluation service
        success, outputs, error_message = self._evaluate(all_test_cases)
        if not success:
            return {'error': error_message}

        # 3. Process the returned results
        details = []
        correct = 0
        for output in outputs:
            if output.get('status') == 'OK':
                output['correct'] = True
                correct += 1
            else:
                output['correct'] = False

            details.append(output)

        return {
            f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
            'details': details
        }
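The `_remove_prefix` heuristic above deserves a standalone look: chat models usually echo the prompt before continuing, so the completion is cut just after the line most similar to the prompt's final line. A self-contained illustration with invented strings:

import difflib

prompt = 'def add(a, b):'
completion = 'def add(a, b):\n    return a + b'

last_prompt_line = prompt.splitlines()[-1]
for i, line in enumerate(completion.splitlines()):
    # same similarity test as _remove_prefix, threshold 0.95
    if difflib.SequenceMatcher(None, last_prompt_line, line).ratio() >= 0.95:
        print('\n'.join(completion.splitlines()[i + 1:]))  # '    return a + b'
        break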
@@ -193,6 +193,12 @@ DATASETS_MAPPING = {
        "hf_id": "",
        "local": "./data/mmlu_pro",
    },
    # MultiPL-E
    "opencompass/multipl_e": {
        "ms_id": "",
        "hf_id": "",
        "local": "./data/multipl_e",
    },
    # NQ
    "opencompass/natural_question": {
        "ms_id": "opencompass/natural_question",

@@ -627,6 +633,11 @@ DATASETS_URL = {
        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip",
        "md5": "e3200c7380f4cea5f13c768f2815fabb",
    },
    "multipl_e": {
        "url":
        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/multipl_e.zip",
        "md5": "24462aac7a38a4a62f5c5e89eb614e20",
    },
    "/Longbench": {
        "url":
        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip",
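For orientation, here is a rough, self-contained sketch of how entries like these are typically consumed; this is an illustration only, not the actual `get_data_path` implementation:

import os

DATASETS_MAPPING = {'opencompass/multipl_e': {'local': './data/multipl_e'}}
DATASETS_URL = {
    'multipl_e': {
        'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/multipl_e.zip',
        'md5': '24462aac7a38a4a62f5c5e89eb614e20',
    },
}


def resolve(key: str) -> str:
    """Map a dataset key to a local directory, downloading on first use."""
    local = DATASETS_MAPPING[key]['local']
    if not os.path.exists(local):
        # a download-and-md5-check step would go here, keyed on DATASETS_URL
        pass
    return local


print(resolve('opencompass/multipl_e'))  # -> ./data/multipl_e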