livecodebench

Dongsheng Zhu 2025-05-14 09:13:36 +00:00
parent 431047ab05
commit cb34f95984
2 changed files with 195 additions and 6 deletions

View File

@@ -0,0 +1,166 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)

from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'

# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    n=3,
    k=2
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# TestOutput Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    # LCBCodeExecution_dataset,
    # LCBTestOutput_dataset,
]
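
Only the code-generation split is active in LCB_datasets; the execution and test-output splits are left commented out. As a minimal usage sketch (not part of this commit), a dataset list like this is normally pulled into an OpenCompass run config through read_base(); the relative import path below is hypothetical and depends on where this config file sits under the configs tree.

from mmengine.config import read_base

with read_base():
    # Hypothetical location; point this at wherever the config above is saved.
    from .datasets.livecodebench.livecodebench_gen import LCB_datasets

datasets = [*LCB_datasets]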

View File

@@ -248,6 +248,28 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
            end_date=end_date)['test']
        self.extractor_version = extractor_version

    def _build_results(self, extracted_predictions, metrics, eval_results,
                       final_metadata):
        results = {}
        results['pass@1'] = metrics.get('pass@1', 0.0)

        details = []
        # Safely get the details list from metrics
        r = metrics.get('details', {}).get('pass@1', [])
        for i, (ep, er, fm) in enumerate(
                zip(extracted_predictions.values(), eval_results.values(),
                    final_metadata)):
            detail = {
                'extracted_prediction':
                ep[0] if isinstance(ep, list) and ep else ep,
                'eval_result': er[0] if isinstance(er, list) and er else er,
                'final_metadata': fm[0] if isinstance(fm, list) and fm else fm
            }
            # Use r[i] if available, otherwise fall back to False
            detail['correct'] = bool(r[i] == 100.0) if i < len(r) else False
            details.append(detail)

        results['details'] = details
        return results

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
@@ -295,13 +317,14 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
            num_process_evaluate=self.num_process_evaluate,
            timeout=self.timeout,
        )
        results = {
            'extracted_predictions': extracted_predictions,
            'eval_results': eval_results
        }
        results.update(metrics)
        # results = {
        #     'extracted_predictions': extracted_predictions,
        #     'eval_results': eval_results
        # }
        # results.update(metrics)
        return results
        return self._build_results(extracted_predictions, metrics,
                                   eval_results, final_metadata)


def evaluate_score(args) -> list[bool]:
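
For orientation only (not part of the commit), here is a minimal sketch of the result shape _build_results produces, assuming evaluator is an already constructed LCBCodeGenerationEvaluator and that metrics carries percentage values under details['pass@1'] as in the code above; all sample values are invented for illustration.

extracted_predictions = {0: ['print(input())'], 1: ['pass']}
eval_results = {0: [[True]], 1: [[False]]}
final_metadata = [[{'error': ''}], [{'error': 'Wrong Answer'}]]
metrics = {'pass@1': 50.0, 'details': {'pass@1': [100.0, 0.0]}}

results = evaluator._build_results(extracted_predictions, metrics,
                                    eval_results, final_metadata)
# results['pass@1'] == 50.0
# results['details'][0]['correct'] is True   (r[0] == 100.0)
# results['details'][1]['correct'] is False  (r[1] == 0.0)
# each detail also holds the unwrapped prediction, eval result, and metadata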