From 2c79dc522723fe294671a3ff52e138879b9571b6 Mon Sep 17 00:00:00 2001
From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com>
Date: Mon, 12 May 2025 18:38:13 +0800
Subject: [PATCH] [Dataset] Add human_eval/mbpp pro (#2092)

* add bench

* update

* bug fix

* time update

* add index

* fix repeat bug
---
 dataset-index.yml                             |  12 ++
 .../configs/datasets/humaneval_pro/README.md  |  17 +++
 .../humaneval_pro/humaneval_pro_gen.py        |   4 +
 .../humaneval_pro/humaneval_pro_gen_3dc067.py |  46 ++++++
 .../humaneval_pro_repeat_gen_3dc067.py        |  48 ++++++
 .../configs/datasets/mbpp_pro/README.md       |  17 +++
 .../configs/datasets/mbpp_pro/mbpp_pro_gen.py |   4 +
 .../datasets/mbpp_pro/mbpp_pro_gen_3dc067.py  |  46 ++++++
 .../mbpp_pro/mbpp_pro_repeat_gen_3dc067.py    |  48 ++++++
 .../datasets/multipl_e/multiple_gen.py        |   4 +
 ..._gen.py => multiple_top_ten_gen_f44aaf.py} |   2 -
 .../multiple_top_ten_repeat_gen_0cd6ce.py     |  58 +++++++
 opencompass/datasets/__init__.py              |   2 +
 opencompass/datasets/humaneval_pro.py         |  81 ++++++++++
 opencompass/datasets/mbpp_pro.py              |  81 ++++++++++
 opencompass/datasets/multipl_e.py             |  65 ++++++--
 .../openicl/icl_evaluator/code_evaluator.py   | 142 +++++++-----------
 opencompass/utils/datasets_info.py            |  22 ++-
 18 files changed, 593 insertions(+), 106 deletions(-)
 create mode 100644 opencompass/configs/datasets/humaneval_pro/README.md
 create mode 100644 opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
 create mode 100644 opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py
 create mode 100644 opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py
 create mode 100644 opencompass/configs/datasets/mbpp_pro/README.md
 create mode 100644 opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
 create mode 100644 opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py
 create mode 100644 opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py
 create mode 100644 opencompass/configs/datasets/multipl_e/multiple_gen.py
 rename opencompass/configs/datasets/multipl_e/{multiple_top_ten_gen.py => multiple_top_ten_gen_f44aaf.py} (97%)
 create mode 100644 opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py
 create mode 100644 opencompass/datasets/humaneval_pro.py
 create mode 100644 opencompass/datasets/mbpp_pro.py

diff --git a/dataset-index.yml b/dataset-index.yml
index 57bd924e..5ebad535 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -611,6 +611,12 @@
     paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
     configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
     configpath_llmjudge: ''
+- humaneval_pro:
+    name: HumanEval Pro
+    category: Code
+    paper: https://arxiv.org/abs/2412.21199
+    configpath: opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
+    configpath_llmjudge: ''
 - hungarian_math:
     name: Hungarian_Math
     category: Math
@@ -695,6 +701,12 @@
     paper: ''
     configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
     configpath_llmjudge: ''
+- mbpp_pro:
+    name: MBPP Pro
+    category: Code
+    paper: https://arxiv.org/abs/2412.21199
+    configpath: opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
+    configpath_llmjudge: ''
 - mgsm:
     name: MGSM
     category: Language / Math
diff --git a/opencompass/configs/datasets/humaneval_pro/README.md b/opencompass/configs/datasets/humaneval_pro/README.md
new file mode 100644
index 00000000..853b59f2
--- /dev/null
+++ b/opencompass/configs/datasets/humaneval_pro/README.md
@@ -0,0 +1,17 @@
+# HumanEval Pro
+
+## OpenCompass (OC) results
+
+| model                        | pass@1 (%) |
+|:----------------------------:|-----------:|
+| qwen2.5-coder-7b-instruct-hf |         65 |
+| qwen2.5-14b-instruct-hf      |         67 |
+| deepseek-v2-lite-chat-hf     |         35 |
+
+## CodeEval-Pro results
+
+| model                        | pass@1 (%) |
+|:----------------------------:|-----------:|
+| qwen2.5-coder-7b-instruct-hf |         65 |
+| qwen2.5-14b-instruct-hf      |         65 |
+| deepseek-v2-lite-chat-hf     |         28 |
\ No newline at end of file
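The README above only lists scores; a minimal OpenCompass entry config that wires the new dataset to a model looks like the sketch below. The model import path is an assumption (substitute any model config present in your local `opencompass/configs/models` tree); the dataset import mirrors `humaneval_pro_gen.py` from this patch.

```python
# eval_humaneval_pro.py -- minimal entry config (sketch, assumed model path).
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import \
        humanevalpro_datasets
    # Hypothetical model config module; point this at one that exists locally.
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
        models

datasets = humanevalpro_datasets
```

Run it with `python run.py eval_humaneval_pro.py` (or the `opencompass` CLI).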
diff --git a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
new file mode 100644
index 00000000..9bccdd66
--- /dev/null
+++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .humaneval_pro_gen_3dc067 import humanevalpro_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py
new file mode 100644
index 00000000..e3ed8349
--- /dev/null
+++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalProDataset, HumanevalProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write solutions in a single Python file for the following two problems; the solution to the second problem requires one or more calls to the solution of the first problem.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+humanevalpro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+humanevalpro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalpro_eval_cfg = dict(
+    evaluator=dict(type=HumanevalProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space')
+)
+
+humanevalpro_datasets = [
+    dict(
+        abbr='humaneval_pro',
+        type=HumanevalProDataset,
+        path='opencompass/humaneval_pro',
+        reader_cfg=humanevalpro_reader_cfg,
+        infer_cfg=humanevalpro_infer_cfg,
+        eval_cfg=humanevalpro_eval_cfg)
+]
\ No newline at end of file
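For a concrete sense of what the model sees, the sketch below renders the wrapper with a toy self-invoking pair. The two problems are invented placeholders, not dataset records, and the wrapper is abridged to its core lines.

```python
# Toy illustration of the humaneval_pro prompt (abridged PROMPT_WRAPPER).
PROMPT_WRAPPER = (
    'Write solutions in a single Python file for the following two problems; '
    'the solution to the second problem requires one or more calls to the '
    'solution of the first problem.\n'
    '```python\n'
    '{raw_problem}\n'
    '{new_problem}\n'
    '```\n')

raw_problem = 'def add(a, b):\n    """Return a + b."""\n'
new_problem = 'def add_three(a, b, c):\n    """Return a + b + c via add()."""\n'
print(PROMPT_WRAPPER.format(raw_problem=raw_problem, new_problem=new_problem))
```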
diff --git a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py
new file mode 100644
index 00000000..98320f78
--- /dev/null
+++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py
@@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalProDataset, HumanevalProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write solutions in a single Python file for the following two problems; the solution to the second problem requires one or more calls to the solution of the first problem.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+humanevalpro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+humanevalpro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalpro_eval_cfg = dict(
+    evaluator=dict(type=HumanevalProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space')
+)
+
+humanevalpro_datasets = [
+    dict(
+        abbr='humaneval_pro',
+        type=HumanevalProDataset,
+        path='opencompass/humaneval_pro',
+        reader_cfg=humanevalpro_reader_cfg,
+        infer_cfg=humanevalpro_infer_cfg,
+        eval_cfg=humanevalpro_eval_cfg,
+        n=5,
+        k=3)
+]
\ No newline at end of file
diff --git a/opencompass/configs/datasets/mbpp_pro/README.md b/opencompass/configs/datasets/mbpp_pro/README.md
new file mode 100644
index 00000000..d34980e1
--- /dev/null
+++ b/opencompass/configs/datasets/mbpp_pro/README.md
@@ -0,0 +1,17 @@
+# MBPP Pro
+
+## OpenCompass (OC) results
+
+| model                        | pass@1 (%) |
+|:----------------------------:|-----------:|
+| qwen2.5-coder-7b-instruct-hf |         66 |
+| qwen2.5-14b-instruct-hf      |         64 |
+| deepseek-v2-lite-chat-hf     |         36 |
+
+## CodeEval-Pro results
+
+| model                        | pass@1 (%) |
+|:----------------------------:|-----------:|
+| qwen2.5-coder-7b-instruct-hf |         65 |
+| qwen2.5-14b-instruct-hf      |         65 |
+| deepseek-v2-lite-chat-hf     |         39 |
\ No newline at end of file
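The `_repeat_` variants sample `n=5` completions per problem and report `pass@3`. The aggregation happens inside OpenCompass; for reference, this is the standard unbiased estimator from the HumanEval paper (Chen et al., 2021) on which pass@k reporting is conventionally based:

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: 1 - C(n - c, k) / C(n, k).

    n: samples per problem, c: samples that passed, k: attempt budget.
    """
    if n - c < k:
        return 1.0  # fewer than k failures, so every k-subset has a pass
    return 1.0 - comb(n - c, k) / comb(n, k)


# n=5 samples, 2 of them correct: pass@3 = 1 - C(3,3)/C(5,3) = 0.9
print(pass_at_k(5, 2, 3))
```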
diff --git a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
new file mode 100644
index 00000000..84d45d83
--- /dev/null
+++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .mbpp_pro_gen_3dc067 import mbpppro_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py
new file mode 100644
index 00000000..c14b05cb
--- /dev/null
+++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write solutions in a single Python file for the following two problems; the solution to the second problem requires one or more calls to the solution of the first problem.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+mbpppro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+mbpppro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+mbpppro_eval_cfg = dict(
+    evaluator=dict(type=MBPPProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space'),
+)
+
+mbpppro_datasets = [
+    dict(
+        abbr='mbpp_pro',
+        type=MBPPProDataset,
+        path='opencompass/mbpp_pro',
+        reader_cfg=mbpppro_reader_cfg,
+        infer_cfg=mbpppro_infer_cfg,
+        eval_cfg=mbpppro_eval_cfg)
+]
\ No newline at end of file
diff --git a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py
new file mode 100644
index 00000000..631713b8
--- /dev/null
+++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py
@@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write solutions in a single Python file for the following two problems; the solution to the second problem requires one or more calls to the solution of the first problem.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+mbpppro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+mbpppro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+mbpppro_eval_cfg = dict(
+    evaluator=dict(type=MBPPProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space'),
+)
+
+mbpppro_datasets = [
+    dict(
+        abbr='mbpp_pro',
+        type=MBPPProDataset,
+        path='opencompass/mbpp_pro',
+        reader_cfg=mbpppro_reader_cfg,
+        infer_cfg=mbpppro_infer_cfg,
+        eval_cfg=mbpppro_eval_cfg,
+        n=5,
+        k=3)
+]
\ No newline at end of file
diff --git a/opencompass/configs/datasets/multipl_e/multiple_gen.py b/opencompass/configs/datasets/multipl_e/multiple_gen.py
new file mode 100644
index 00000000..b32af567
--- /dev/null
+++ b/opencompass/configs/datasets/multipl_e/multiple_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .multiple_top_ten_gen_f44aaf import multiple_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py
similarity index 97%
rename from opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py
rename to opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py
index 93ab2962..040c5ba5 100644
--- a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py
+++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py
@@ -32,7 +32,6 @@ multiple_datasets = [
         type=MultiplEDataset,
         abbr=f'humaneval-multiple-{lang}',
         language=lang,
-        num_repeats=1,
         path='opencompass/multipl_e',
         tag='humaneval',
         reader_cfg=multiple_reader_cfg,
@@ -46,7 +45,6 @@ multiple_datasets += [
         type=MultiplEDataset,
         abbr=f'mbpp-multiple-{lang}',
         language=lang,
-        num_repeats=1,
         path='opencompass/multipl_e',
         tag='mbpp',
         reader_cfg=multiple_reader_cfg,
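The rename above also removes `num_repeats=1` from the per-dataset entries. My reading of the change, sketched below: repetition used to be baked into the loaded dataset, while the new `_repeat_` configs instead declare `n` (samples per problem) and `k` (the pass@k threshold) on the dataset entry. Both field names are taken from the configs in this patch; the interpretation is mine.

```python
# Sketch: the same MultiPL-E entry before and after this patch (abridged).
before = dict(
    abbr='humaneval-multiple-cpp',
    language='cpp',
    num_repeats=1,  # the loader duplicated every problem num_repeats times
    path='opencompass/multipl_e',
    tag='humaneval',
)
after = dict(
    abbr='humaneval-multiple-cpp',
    language='cpp',
    path='opencompass/multipl_e',  # loader now yields each problem once
    tag='humaneval',
    n=5,  # completions sampled per problem
    k=3,  # pass@k threshold reported
)
```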
diff --git a/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py b/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py
new file mode 100644
index 00000000..1a603d32
--- /dev/null
+++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py
@@ -0,0 +1,58 @@
+# Repeat (pass@k) variant of the MultiPL-E top-ten test set; only cpp is currently enabled in _TOP_TEN_LANGUAGE_.
+
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MultiplEDataset, MultiplEEvaluator
+
+
+_TOP_TEN_LANGUAGE_ = ['cpp']
+
+multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests')
+
+multiple_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+multiple_eval_cfg = {
+    lang: dict(
+        evaluator=dict(
+            type=MultiplEEvaluator,
+            language=lang,
+            ip_address='https://opencompass-multiple-evaluator.hf.space',
+        ),
+        pred_role='BOT',
+    ) for lang in _TOP_TEN_LANGUAGE_
+}
+
+multiple_datasets = [
+    dict(
+        type=MultiplEDataset,
+        abbr=f'humaneval-multiple-{lang}',
+        language=lang,
+        path='opencompass/multipl_e',
+        tag='humaneval',
+        reader_cfg=multiple_reader_cfg,
+        infer_cfg=multiple_infer_cfg,
+        eval_cfg=multiple_eval_cfg[lang],
+        n=5,
+        k=3
+    ) for lang in _TOP_TEN_LANGUAGE_
+]
+
+multiple_datasets += [
+    dict(
+        type=MultiplEDataset,
+        abbr=f'mbpp-multiple-{lang}',
+        language=lang,
+        path='opencompass/multipl_e',
+        tag='mbpp',
+        reader_cfg=multiple_reader_cfg,
+        infer_cfg=multiple_infer_cfg,
+        eval_cfg=multiple_eval_cfg[lang],
+        n=5,
+        k=3
+    ) for lang in _TOP_TEN_LANGUAGE_
+]
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index c441a2d8..92cda579 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -64,6 +64,7 @@ from .hle import *  # noqa: F401, F403
 from .huggingface import *  # noqa: F401, F403
 from .humaneval import *  # noqa: F401, F403
 from .humaneval_multi import *  # noqa: F401, F403
+from .humaneval_pro import *  # noqa: F401, F403
 from .humanevalx import *  # noqa: F401, F403
 from .hungarian_math import *  # noqa: F401, F403
 from .IFEval.ifeval import IFEvalDataset, IFEvaluator  # noqa: F401, F403
@@ -96,6 +97,7 @@ from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
 from .mathbench import *  # noqa: F401, F403
 from .mbpp import *  # noqa: F401, F403
+from .mbpp_pro import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .MedCalc_Bench import MedCalc_BenchDataset  # noqa: F401
 from .MedCalc_Bench import MedCalcOfficial_Evaluator  # noqa: F401
diff --git a/opencompass/datasets/humaneval_pro.py b/opencompass/datasets/humaneval_pro.py
new file mode 100644
index 00000000..871b468f
--- /dev/null
+++ b/opencompass/datasets/humaneval_pro.py
@@ -0,0 +1,81 @@
+# flake8: noqa: E501
+
+import json
+from typing import Dict, List
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write solutions in a single Python file for the following two problems; the solution to the second problem requires one or more calls to the solution of the first problem.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+class HumanevalProDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, local_mode=False):
+        path = get_data_path(path, local_mode=local_mode)
+        dataset = []
+        with open(path, encoding='utf-8') as f:
+            raw_data = json.load(f)
+            for data in raw_data:
+                dataset.append(data)
+        return Dataset.from_list(dataset)
+
+
+class HumanevalProEvaluator(CodeEvaluator):
+
+    def score(self, predictions: List, references: List,
+              test_set: Dataset) -> Dict:
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        test_set = test_set.to_pandas()
+        # Use the first column as the unique identifier
+        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
+
+        # 1. Prepare data for all test cases
+        all_test_cases, prompts = [], []
+        for i in range(len(test_set_origin)):
+            test_case = test_set_origin.iloc[i]
+            completion = predictions[i]
+
+            # Process code completions
+            processed_completion = self._process_completions(
+                test_case, completion)
+            code = processed_completion + '\n' + test_case['test_code']
+            sub_data_dict = {
+                'name': int(test_case['id']),
+                'language': self.language,
+                'code': code,
+            }
+            all_test_cases.append(sub_data_dict)
+
+            prompt = PROMPT_WRAPPER.format(
+                raw_problem=test_case['raw_problem'],
+                new_problem=test_case['new_problem'])
+            prompts.append(prompt)
+
+        # 2. Send all test cases to the evaluation service
+        success, outputs, error_message = self._evaluate(all_test_cases)
+        if not success:
+            return {'error': error_message}
+
+        # 3. Process the returned results
+        return self._process_results(outputs, prompts, len(test_set_origin))
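Each element of `all_test_cases` built in `score` above is a minimal dict handed to the evaluation service. A sketch of one entry; the code and test strings are toys, but the field set comes straight from the code above:

```python
# One payload entry as HumanevalProEvaluator.score assembles it.
sub_data_dict = {
    'name': 0,         # int(test_case['id'])
    'language': 'py',  # CodeEvaluator's default language
    # processed completion + '\n' + the test_code column
    'code': 'def add(a, b):\n    return a + b\n\nassert add(1, 2) == 3\n',
}
```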
diff --git a/opencompass/datasets/mbpp_pro.py b/opencompass/datasets/mbpp_pro.py
new file mode 100644
index 00000000..fe7d01a4
--- /dev/null
+++ b/opencompass/datasets/mbpp_pro.py
@@ -0,0 +1,81 @@
+# flake8: noqa: E501
+
+import json
+from typing import Dict, List
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write solutions in a single Python file for the following two problems; the solution to the second problem requires one or more calls to the solution of the first problem.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+class MBPPProDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, local_mode=False):
+        path = get_data_path(path, local_mode=local_mode)
+        dataset = []
+        with open(path, encoding='utf-8') as f:
+            for line in f:
+                dataset.append(json.loads(line.strip()))
+        return Dataset.from_list(dataset)
+
+
+class MBPPProEvaluator(CodeEvaluator):
+
+    def score(self, predictions: List, references: List,
+              test_set: Dataset) -> Dict:
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        test_set = test_set.to_pandas()
+        # Use the first column as the unique identifier
+        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
+
+        # 1. Prepare data for all test cases
+        all_test_cases, prompts = [], []
+        for i in range(len(test_set_origin)):
+            test_case = test_set_origin.iloc[i]
+            completion = predictions[i]
+
+            # Process code completions
+            processed_completion = self._process_completions(
+                test_case, completion)
+            code = processed_completion + '\n' + test_case['test_code']
+            sub_data_dict = {
+                'name': int(test_case['id']),
+                'language': self.language,
+                'code': code,
+            }
+            all_test_cases.append(sub_data_dict)
+
+            prompt = PROMPT_WRAPPER.format(
+                raw_problem=test_case['raw_problem'],
+                new_problem=test_case['new_problem'])
+            prompts.append(prompt)
+
+        # 2. Send all test cases to the evaluation service
+        success, outputs, error_message = self._evaluate(all_test_cases)
+        if not success:
+            return {'error': error_message}
+
+        # 3. Process the returned results
+        return self._process_results(outputs, prompts, len(test_set_origin))
""" - processed_completions = [] - for comp in completions: - comp = self._extract_code(comp) - post_comp = self._remove_prefix(test_case['prompt'], comp) - post_comp = self._stop_at_stop_token(post_comp, - test_case['stop_tokens']) - processed_completions.append(post_comp) - return processed_completions + post_comp = self._extract_code(completion) + post_comp = self._remove_prefix(test_case['prompt'], post_comp) + post_comp = self._stop_at_stop_token(post_comp, + test_case['stop_tokens']) + return post_comp diff --git a/opencompass/openicl/icl_evaluator/code_evaluator.py b/opencompass/openicl/icl_evaluator/code_evaluator.py index d586cd6e..a2804207 100644 --- a/opencompass/openicl/icl_evaluator/code_evaluator.py +++ b/opencompass/openicl/icl_evaluator/code_evaluator.py @@ -1,12 +1,12 @@ # flake8: noqa: E501 -import difflib import os import re import tempfile import time from typing import Any, Dict, List, Optional, Tuple, Union +import numpy as np from datasets import Dataset from gradio_client import Client @@ -24,9 +24,9 @@ class CodeEvaluator(BaseEvaluator): """ def __init__(self, - language: str, + language: str = 'py', ip_address: str = 'localhost', - retry: int = 3) -> None: + retry: int = 5) -> None: """Initialize the CodeEvaluator. Args: @@ -71,6 +71,7 @@ class CodeEvaluator(BaseEvaluator): - output (dict/list/str): Evaluation results or error message """ try: + import requests temp_file_path = None # Handle file path input if isinstance(input_data, str): @@ -83,7 +84,15 @@ class CodeEvaluator(BaseEvaluator): input_data = temp_file_path # Send to evaluation service - result = self.client.predict(input_data, api_name='/evaluate') + try: + result = self.client.predict(input_data, api_name='/evaluate') + except Exception as e: + # Catch timeout and other exceptions + if 'timed out' in str(e).lower() or 'timeout' in str( + e).lower(): + return False, f'Request to code eval service timed out: {e}' + else: + raise # Process the result if isinstance(result, (dict, list)): @@ -107,63 +116,16 @@ class CodeEvaluator(BaseEvaluator): except: # noqa: E722 pass - def _remove_prefix(self, - prompt: str, - completion: str, - threshold: float = 0.95) -> str: - """Determine the truncation point in the completion based on the last - line of the prompt, remove all content before that line in the - completion, and return the completion string after removing the prefix. - This is done to convert chatbot-style inference mode to completion - mode. + def _process_completions(self, completion: str) -> list: + """Process code completions to extract the relevant code. Args: - prompt (str): The prompt text. - completion (str): The completion text. - threshold (float): Line similarity threshold. - + completion (str): Code completion string. Returns: - str: The completion string after removing the prefix. + list: List of processed code completions. 
""" - prompt_lines = prompt.splitlines() - completion_lines = completion.splitlines() - - if not prompt_lines: - return completion - - last_prompt_line = prompt_lines[-1] - cut_index = -1 - - for i, completion_line in enumerate(completion_lines): - similarity = difflib.SequenceMatcher(None, last_prompt_line, - completion_line).ratio() - if similarity >= threshold: - cut_index = i - break - - if cut_index != -1: - return '\n'.join(completion_lines[cut_index + 1:]) - else: - return completion - - def _process_completions(self, test_case: dict, completions: list) -> list: - """Process code completion list, which typically involves extracting - code, removing repetitive prefixes caused by chatbot mode, and other - steps to ensure the model-generated code can be compiled successfully. - - Args: - test_case (dict): Dictionary containing test case information including: - completions (list): List of code completions generated by the model. - - Returns: - list: Processed code completion list. - """ - processed_completions = [] - for comp in completions: - comp = self._extract_code(comp) - post_comp = self._remove_prefix(test_case['prompt'], comp) - processed_completions.append(post_comp) - return processed_completions + post_comp = self._extract_code(completion) + return post_comp def _evaluate( self, input_data: Union[Dict, List] @@ -186,7 +148,7 @@ class CodeEvaluator(BaseEvaluator): succeed, output = self._code_eval_service(input_data) if not succeed: num_retry += 1 - time.sleep(10) + time.sleep(30) else: break @@ -195,6 +157,31 @@ class CodeEvaluator(BaseEvaluator): return True, output, None + def _process_results(self, outputs: List, prompts: List, + total_count: int) -> Dict: + """Process the evaluation results. + Args: + outputs (list): List of evaluation results for each test case. + prompts (list): List of prompts used for each test case. + total_count (int): Total number of test cases. + Returns: + dict: Processed results including: + - pass@1: Percentage of test cases passed + - details: Detailed results for each test case + """ + details = [] + correct = 0 + for output, prompt in zip(outputs, prompts): + output['prompt'] = prompt + if output.get('status') == 'OK': + output['correct'] = True + correct += 1 + else: + output['correct'] = False + details.append(output) + + return {f'pass@1': 100 * correct / total_count, 'details': details} + def score(self, predictions: List, references: List, test_set: Dataset) -> Dict: """Score code generation predictions against references. @@ -221,28 +208,25 @@ class CodeEvaluator(BaseEvaluator): test_set = test_set.to_pandas() # Use the first column as the unique identifier test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0]) - num_repeats = int(len(test_set) / len(test_set_origin)) # 1. 
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 10ca4436..ce12af64 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -451,7 +451,16 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/nejmaibench/NEJM_All_Questions_And_Answers.csv",
     },
-
+    "opencompass/humaneval_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/humaneval_pro/humaneval_pro.json",
+    },
+    "opencompass/mbpp_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/mbpp_pro/mbpp_pro.json",
+    },
 }
 
 DATASETS_URL = {
@@ -808,6 +817,13 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nejmaibench.zip",
         "md5": "e6082cae3596b3ebea73e23ba445b99e"
-    }
-
+    },
+    "humaneval_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
+        "md5": "4c6fe556e84e905e4f0902d699e46de5",
+    },
+    "mbpp_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
+        "md5": "eac330b8a0a8687f006265c9383503ce",
+    },
 }