bigcodebench update

Dongsheng Zhu 2025-03-04 08:38:24 +00:00
parent 84ade2ef3c
commit 63c7970937
6 changed files with 191 additions and 10 deletions

View File

@@ -34,7 +34,8 @@ bigcodebench_full_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='complete',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='full',
     ),
     pred_role='BOT',
@@ -50,4 +51,4 @@ bigcodebench_full_complete_datasets = [
         eval_cfg=bigcodebench_full_eval_cfg,
         release_version='v0.1.2'
     )
-]
+]
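The only functional change in this config (and in the three sibling configs below) is the default `remote_execute_api`: the BigCode-hosted Space is kept as a comment and the evaluator now defaults to an OpenCompass-hosted mirror. If neither endpoint is reachable, the URL can still be overridden after importing the dataset list. A minimal sketch, assuming the variable names from this config; `LOCAL_EVALUATOR_URL` is a hypothetical placeholder for a self-hosted bigcodebench-evaluator instance, not part of this commit:

# Hedged sketch, not part of the commit: point the evaluator at a
# self-hosted endpoint instead of the default Hugging Face Space.
LOCAL_EVALUATOR_URL = 'http://127.0.0.1:7860'  # hypothetical self-hosted URL

for _ds in bigcodebench_full_complete_datasets:
    _ds['eval_cfg']['evaluator']['remote_execute_api'] = LOCAL_EVALUATOR_URL

The same override pattern applies to the instruct and hard variants below, which differ only in eval_type and dataset_version.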

View File

@@ -34,7 +34,8 @@ bigcodebench_full_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='full',
     ),
     pred_role='BOT',
@@ -50,4 +51,4 @@ bigcodebench_full_instruct_datasets = [
         eval_cfg=bigcodebench_full_eval_cfg,
         release_version='v0.1.2'
     )
-]
+]

View File

@@ -34,7 +34,8 @@ bigcodebench_hard_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='complete',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
     ),
     pred_role='BOT',
@@ -51,4 +52,4 @@ bigcodebench_hard_complete_datasets = [
         release_version='v0.1.2',
         dataset_version='hard',
     )
-]
+]

View File

@@ -34,7 +34,8 @@ bigcodebench_hard_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api='https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
     ),
     pred_role='BOT',
@@ -51,4 +52,4 @@ bigcodebench_hard_instruct_datasets = [
         release_version='v0.1.2',
         dataset_version='hard',
     )
-]
+]

View File

@@ -0,0 +1,167 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)

lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'

# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
        release_version='release_v5',
        start_date='2024-08-01',
        end_date='2025-02-01'
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_v5',
)
# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'  # noqa: E501
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)
# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
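The new config closes with LCB_datasets, which bundles the three LiveCodeBench-style tasks (code generation, code execution, test output prediction). A minimal usage sketch, assuming the standard OpenCompass read_base pattern; the relative module path is hypothetical, since the diff does not show the new file's name:

# Hedged sketch, not part of the commit: pull the three LCB datasets into a
# run config. 'livecodebench_gen' is a placeholder for wherever this new
# config file lives in the repo.
from mmengine.config import read_base

with read_base():
    from .livecodebench_gen import LCB_datasets  # hypothetical module path

datasets = [*LCB_datasets]
models = []  # fill in the model configs to evaluate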

View File

@@ -197,11 +197,21 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                 break
             except (httpx.ReadTimeout, CancelledError):
                 logger.info('Read timeout error. Retrying in 4s...')
-                time.sleep(4)
+                time.sleep(10)

         if 'pass@1' in pass_at_k.keys():
             pass_at_k['pass@1'] *= 100

-        dump_results = {'details': results}
+        dump_results = {'details': self._results_processor(results)}
         dump_results.update(pass_at_k)
         return dump_results
+
+    def _results_processor(self, results):
+        details = []
+        for key, value in results['eval'].items():
+            if value[0]['status'] == 'pass':
+                value[0]['correct'] = True
+            else:
+                value[0]['correct'] = False
+            details.append(value[0])
+        return details
return details