diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py
index f2edc098..720776fe 100644
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py
@@ -34,7 +34,8 @@ bigcodebench_full_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='complete',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='full',
     ),
     pred_role='BOT',
@@ -50,4 +51,4 @@ bigcodebench_full_complete_datasets = [
         eval_cfg=bigcodebench_full_eval_cfg,
         release_version='v0.1.2'
     )
-]
\ No newline at end of file
+]
diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py
index 88b410ae..0be7d6a6 100644
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py
@@ -34,7 +34,8 @@ bigcodebench_full_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='full',
     ),
     pred_role='BOT',
@@ -50,4 +51,4 @@ bigcodebench_full_instruct_datasets = [
         eval_cfg=bigcodebench_full_eval_cfg,
         release_version='v0.1.2'
     )
-]
\ No newline at end of file
+]
diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py
index c0419774..9eb0c219 100644
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py
@@ -34,7 +34,8 @@ bigcodebench_hard_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='complete',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
     ),
     pred_role='BOT',
@@ -51,4 +52,4 @@ bigcodebench_hard_complete_datasets = [
         release_version='v0.1.2',
         dataset_version='hard',
     )
-]
\ No newline at end of file
+]
diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py
index 3d1cc82c..443dae9d 100644
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py
@@ -34,7 +34,8 @@ bigcodebench_hard_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
     ),
     pred_role='BOT',
@@ -51,4 +52,4 @@ bigcodebench_hard_instruct_datasets = [
         release_version='v0.1.2',
         dataset_version='hard',
     )
-]
\ No newline at end of file
+]
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py
new file mode 100644
index 00000000..f2a17026
--- /dev/null
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py
@@ -0,0 +1,167 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+        release_version='release_v5',
+        start_date='2024-08-01',
+        end_date='2025-02-01'
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg,
+    release_version='release_v5',
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'  # noqa: E501
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# Test Output Prediction Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    LCBCodeExecution_dataset,
+    LCBTestOutput_dataset,
+]
diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py
index f347e9e2..9ce3d196 100644
--- a/opencompass/datasets/bigcodebench/bigcodebench.py
+++ b/opencompass/datasets/bigcodebench/bigcodebench.py
@@ -197,11 +197,20 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                 break
             except (httpx.ReadTimeout, CancelledError):
-                logger.info('Read timeout error. Retrying in 4s...')
-                time.sleep(4)
+                logger.info('Read timeout error. Retrying in 10s...')
+                time.sleep(10)
 
         if 'pass@1' in pass_at_k.keys():
             pass_at_k['pass@1'] *= 100
 
-        dump_results = {'details': results}
+        dump_results = {'details': self._results_processor(results)}
         dump_results.update(pass_at_k)
         return dump_results
+
+    def _results_processor(self, results):
+        # Flatten the remote evaluator payload: keep the first record per
+        # task and mark whether it passed, for the dumped details.
+        details = []
+        for value in results['eval'].values():
+            value[0]['correct'] = value[0]['status'] == 'pass'
+            details.append(value[0])
+        return details
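For context, a minimal sketch of the transformation the new _results_processor helper performs. The payload shape (results['eval'] mapping each task id to a list of per-sample records with a 'status' field) is inferred from the method body, not documented in this patch; the sample task ids and the standalone results_processor function below are hypothetical.

# Hypothetical payload, mimicking the shape _results_processor expects:
# results['eval'] maps each task id to a list of per-sample records.
results = {
    'eval': {
        'BigCodeBench/0': [{'status': 'pass', 'solution': 'def task_func(): ...'}],
        'BigCodeBench/1': [{'status': 'fail', 'solution': 'def task_func(): ...'}],
    }
}

def results_processor(results):
    # Mirror of BigCodeBenchEvaluator._results_processor: keep the first
    # record per task and tag it with a boolean 'correct' flag.
    details = []
    for value in results['eval'].values():
        value[0]['correct'] = value[0]['status'] == 'pass'
        details.append(value[0])
    return details

for record in results_processor(results):
    print(record['status'], record['correct'])
# pass True
# fail False

The flattened list, rather than the raw remote payload, is what ends up under 'details' in the dumped results, so downstream tooling can check a uniform boolean 'correct' per sample.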