diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py b/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py
new file mode 100644
index 00000000..e788acd5
--- /dev/null
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py
@@ -0,0 +1,166 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg,
+    n=3,
+    k=2
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# TestOutput Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
+]
diff --git a/opencompass/datasets/livecodebench/evaluator.py b/opencompass/datasets/livecodebench/evaluator.py
index 65867d47..e6afd838 100644
--- a/opencompass/datasets/livecodebench/evaluator.py
+++ b/opencompass/datasets/livecodebench/evaluator.py
@@ -248,6 +248,28 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
             end_date=end_date)['test']
         self.extractor_version = extractor_version

+    def _build_results(self, extracted_predictions, metrics, eval_results,
+                       final_metadata):
+        results = {}
+        results['pass@1'] = metrics.get('pass@1', 0.0)
+        details = []
+        # Safely get the per-sample pass@1 list from metrics['details']
+        r = metrics.get('details', {}).get('pass@1', [])
+        for i, (ep, er, fm) in enumerate(
+                zip(extracted_predictions.values(), eval_results.values(),
+                    final_metadata)):
+            detail = {
+                'extracted_prediction':
+                ep[0] if isinstance(ep, list) and ep else ep,
+                'eval_result': er[0] if isinstance(er, list) and er else er,
+                'final_metadata': fm[0] if isinstance(fm, list) and fm else fm
+            }
+            # Use r[i] if available, otherwise fall back to False
+            detail['correct'] = bool(r[i] == 100.0) if i < len(r) else False
+            details.append(detail)
+        results['details'] = details
+        return results
+
     def score(self, predictions, references):
         if len(predictions) != len(references):
             return {
@@ -295,13 +317,14 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
             num_process_evaluate=self.num_process_evaluate,
             timeout=self.timeout,
         )
-        results = {
-            'extracted_predictions': extracted_predictions,
-            'eval_results': eval_results
-        }
-        results.update(metrics)
+        # results = {
+        #     'extracted_predictions': extracted_predictions,
+        #     'eval_results': eval_results
+        # }
+        # results.update(metrics)

-        return results
+        return self._build_results(extracted_predictions, metrics,
+                                    eval_results, final_metadata)


 def evaluate_score(args) -> list[bool]:
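Usage note (not part of the patch): to smoke-test the new dataset list, a run config can pull it in through the usual `read_base` mechanism. This is a minimal sketch under that assumption; the model section and work directory are left out, and the long import path simply mirrors the config file added above.

```python
# Minimal OpenCompass run-config sketch (assumes the standard `read_base`
# convention for importing config modules; not part of this PR).
from mmengine.config import read_base

with read_base():
    # Dataset list defined by the new repeat-generation config in this PR.
    from opencompass.configs.datasets.livecodebench.livecodebench_code_generation_repeat_gen_b5b6c5 import \
        LCB_datasets

datasets = [*LCB_datasets]
# `models` and `work_dir` would be filled in as usual for an OpenCompass run.
```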
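For context on the new `n=3, k=2` fields: they are presumably consumed by OpenCompass's repeated-generation machinery (not shown in this diff), which can aggregate the per-sample `correct` flags emitted by `_build_results` into a pass@k-style score. The sketch below only illustrates the standard unbiased pass@k estimator from the HumanEval/Codex paper (Chen et al., 2021); it is not the evaluator's actual code path.

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: n generated samples, c of them correct."""
    if n - c < k:
        # Every size-k subset contains at least one correct sample.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# Example matching the config's n=3 repeats with k=2: one correct sample out
# of three gives pass@2 = 1 - C(2,2)/C(3,2) = 1 - 1/3 ~= 0.667.
print(pass_at_k(3, 1, 2))
```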