Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Update] Update Fullbench (#1712)
* Update JudgerBench
* Support O1-style Prompts
* Update Code
parent 300adc31e8
commit f97c4eae42
@@ -47,8 +47,3 @@ for _name in subjective_all_sets:
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
        ))
# ds1000_eval_cfg = dict(
#     evaluator=dict(type=DS1000Evaluator),
#     pred_role='BOT',
#     pred_postprocessor=dict(type=ds1000_postprocess),
# )
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2


aime2024_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


aime2024_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048)
)

aime2024_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)

aime2024_datasets = [
    dict(
        abbr='aime2024',
        type=Aime2024Dataset,
        path='opencompass/aime2024',
        reader_cfg=aime2024_reader_cfg,
        infer_cfg=aime2024_infer_cfg,
        eval_cfg=aime2024_eval_cfg
    )
]
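For orientation, a dataset config like the one above is not run on its own; it is pulled into a run config and handed to the OpenCompass entry point. A minimal sketch, assuming the new file lands under `configs/datasets/aime2024/` and that a model config exists at the path shown (both relative import paths are assumptions, not part of this diff):

```python
# eval_aime2024.py -- minimal run config (sketch; import paths are guesses)
from mmengine.config import read_base

with read_base():
    from .datasets.aime2024.aime2024_gen import aime2024_datasets  # hypothetical path
    from .models.hf_internlm.hf_internlm2_5_7b_chat import models  # hypothetical path

datasets = aime2024_datasets
```

Launched with something like `python run.py configs/eval_aime2024.py`, the reader/infer/eval triple above then drives loading, prompting, and boxed-answer scoring for AIME 2024.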
@@ -0,0 +1,96 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:

    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
@@ -0,0 +1,96 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:

    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
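The two BBH configs above differ only in the multiple-choice prompt: the first prepends "Follow the given examples and answer the question." while the second sends the bare question. Both build one entry per subset keyed by `abbr='bbh-' + name`. A small standalone illustration of that naming scheme and of filtering to a few subsets (plain Python, no OpenCompass imports; subset names are taken from the lists above):

```python
# Standalone sketch of how the per-subset entries are keyed by abbr.
bbh_multiple_choice_sets = ['temporal_sequences', 'date_understanding']
bbh_free_form_sets = ['word_sorting']

bbh_datasets = [dict(abbr='bbh-' + name, name=name)
                for name in bbh_multiple_choice_sets + bbh_free_form_sets]

wanted = {'bbh-date_understanding', 'bbh-word_sorting'}
print([d['abbr'] for d in bbh_datasets if d['abbr'] in wanted])
# ['bbh-date_understanding', 'bbh-word_sorting']
```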
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2


cmo_fib_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


cmo_fib_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\n你需要将最终答案写入\\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048)
)

cmo_fib_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)

cmo_fib_datasets = [
    dict(
        abbr='cmo_fib',
        type=CMOFibDataset,
        path='opencompass/cmo_fib',
        reader_cfg=cmo_fib_reader_cfg,
        infer_cfg=cmo_fib_infer_cfg,
        eval_cfg=cmo_fib_eval_cfg
    )
]
@@ -0,0 +1,52 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator

# openai_simple_eval prompt
align_prompt = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.

{question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

gpqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=align_prompt),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
                     pred_postprocessor=dict(type=GPQA_Simple_Eval_postprocess))

gpqa_datasets = []
gpqa_subsets = {
    # 'extended': 'gpqa_extended.csv',
    # 'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

for split in list(gpqa_subsets.keys()):
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=gpqa_subsets[split],
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )
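The `align_prompt` above asks the model to end with 'ANSWER: $LETTER', which is the convention the GPQA_Simple_Eval_postprocess step then relies on. A hedged illustration of that extraction idea (not the actual OpenCompass implementation):

```python
import re

def extract_answer_letter(response: str) -> str:
    """Grab the final 'ANSWER: X' letter, mirroring the format the prompt requests."""
    matches = re.findall(r'ANSWER\s*:\s*([ABCD])', response)
    return matches[-1] if matches else ''

print(extract_answer_letter('Reasoning about the options...\nANSWER: C'))  # C
```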
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nPlease put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

gsm8k_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)

gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg,
    )
]
@@ -0,0 +1,38 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'
            ),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator

humanevalx_reader_cfg = dict(
    input_columns=['prompt'], output_column='declaration', train_split='test')

humanevalx_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024))

humanevalx_eval_cfg_dict = {
    lang: dict(
        evaluator=dict(
            type=HumanevalXEvaluator,
            language=lang,
            ip_address=
            'localhost',  # replace to your code_eval_server ip_address, port
            port=5001),  # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
        pred_role='BOT')
    for lang in ['python', 'cpp', 'go', 'java', 'js']  # do not support rust now
}

# Please download the needed `xx.jsonl.gz` from
# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
# and move them into `data/humanevalx/` folder
humanevalx_datasets = [
    dict(
        type=HumanevalXDataset,
        abbr=f'humanevalx-{lang}',
        language=lang,
        path='./data/humanevalx',
        reader_cfg=humanevalx_reader_cfg,
        infer_cfg=humanevalx_infer_cfg,
        eval_cfg=humanevalx_eval_cfg_dict[lang])
    for lang in ['python', 'cpp', 'go', 'java', 'js']
]
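Because HumanEval-X scoring runs against an external code-eval service, the `ip_address`/`port` placeholders above usually have to be rewritten for the actual deployment. A small sketch of doing that at the bottom of the same config file (the host below is a made-up placeholder; see the linked code_eval_service guide for launching the server):

```python
# Sketch: retarget every language's evaluator at a running code-eval service.
CODE_EVAL_HOST = '10.0.0.5'   # hypothetical host running the service
CODE_EVAL_PORT = 5001

for _lang, _cfg in humanevalx_eval_cfg_dict.items():
    _cfg['evaluator']['ip_address'] = CODE_EVAL_HOST
    _cfg['evaluator']['port'] = CODE_EVAL_PORT
```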
@@ -0,0 +1,165 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
        release_version='release_v4',
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation_v4',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_v4',
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# TestOutput Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    # LCBCodeExecution_dataset,
    # LCBTestOutput_dataset,
]
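Note that only `LCBCodeGeneration_dataset` is enabled in `LCB_datasets`; the code-execution and test-output tasks are defined but commented out. The generation prompt itself is a plain template with two slots. A self-contained illustration of how it renders (simple `str.format` with made-up values, mirroring the placeholders used above):

```python
prompt_template = ('### Question:\n{question_content}\n\n{format_prompt}'
                   '### Answer: (use the provided format with backticks)\n\n')

print(prompt_template.format(
    question_content='Given a list of integers, print their sum.',
    format_prompt='Read the list from stdin, one integer per line.\n\n'))
```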
@@ -0,0 +1,165 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
        release_version='release_split_v4',
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation_split_v4',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_split_v4',
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# TestOutput Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    # LCBCodeExecution_dataset,
    # LCBTestOutput_dataset,
]
@@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation_v1',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_v1',
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# TestOutput Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    # LCBCodeExecution_dataset,
    # LCBTestOutput_dataset,
]
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math_prm800k_500',
        path='opencompass/math',
        file_name='test_prm800k_500.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
@@ -0,0 +1,85 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator
# from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE

# ----------------------------- Eval Parameters -----------------------------
## Postprocess function
post_func = 're'  # 're', 'xfinder_model', 'naive_model'

## Evaluate function
eval_func = 'naive_model'  # 're', 'naive_model'


## Model api url
# xfinder_url = 'http://0.0.0.0:23333/v1'  # for 'xFinder-qwen1505' if post_func is 'xfinder_model'
# naive_model_name = 'Qwen/Qwen2.5-72B-Instruct'  # replace with your model name
naive_model_name = 'dlc_model'
# naive_model_url = [
#     'http://172.30.56.38:23001/v1',
# ]  # Multi-apis for acceleration
naive_model_url = [
    "http://172.30.56.38:23001/v1",
    "http://172.30.8.4:23003/v1",
    "http://172.30.8.14:23002/v1",
    "http://172.30.48.80:23004/v1",
    "http://172.30.56.132:23005/v1",
    "http://172.30.16.115:23006/v1",
    "http://172.30.48.82:23007/v1",
    "http://172.30.24.53:23008/v1",
    "http://172.30.56.141:23009/v1",
    "http://172.30.8.35:23010/v1",
    "http://172.30.48.85:23011/v1",
    "http://172.30.16.116:23012/v1"
]
# ----------------------------- Detailed Config -----------------------------

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=8192),
)


if post_func == 're':
    pred_postprocessor = dict(type=math_postprocess_v2)


if eval_func == 're':
    evaluator = dict(type=MATHEvaluator, version='v2')
elif eval_func == 'naive_model':
    evaluator = dict(
        type=GaoKaoMATHEvaluator,
        judge_model_name=naive_model_name,
        url=naive_model_url,
    )

# postprocess v2
math_eval_cfg = dict(
    evaluator=evaluator,
    pred_postprocessor=pred_postprocessor,
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math_prm800k_500-llmjudge',
        path='opencompass/math',
        file_name='test_prm800k_500.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
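In this config, `post_func='re'` means the boxed final answer is pulled out by a regex-style postprocessor, while `eval_func='naive_model'` sends the extracted answer to the judge model behind the API URLs listed above; note that `pred_postprocessor` is only defined for the 're' branch. A hedged, self-contained illustration of the boxed-answer extraction idea (not the actual `math_postprocess_v2` implementation):

```python
import re

def last_boxed(text: str) -> str:
    """Return the contents of the last \\boxed{...} span, or '' if none."""
    matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
    return matches[-1] if matches else ''

print(last_boxed(r'Therefore the answer is \boxed{42}.'))  # 42
```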
@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator

sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')

sanitized_mbpp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',),
                # dict(role='BOT', prompt='```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)```',),

                # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',),
                # dict(role='BOT', prompt='```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result```',),

                # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',),
                # dict(role='BOT', prompt='```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums```',),
                dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n You should submit your final solution in the following format: ```python\n\n```',),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')

sanitized_mbpp_datasets = [
    dict(
        type=SanitizedMBPPDataset,
        abbr='sanitized_mbpp',
        path='opencompass/sanitized_mbpp',
        reader_cfg=sanitized_mbpp_reader_cfg,
        infer_cfg=sanitized_mbpp_infer_cfg,
        eval_cfg=sanitized_mbpp_eval_cfg,
    )
]
@@ -47,8 +47,3 @@ for _name in subjective_all_sets:
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
        ))
# ds1000_eval_cfg = dict(
#     evaluator=dict(type=DS1000Evaluator),
#     pred_role='BOT',
#     pred_postprocessor=dict(type=ds1000_postprocess),
# )
@@ -1,6 +1,7 @@
from .evaluator import LCBCodeExecutionEvaluator  # noqa: F401, F403
from .evaluator import LCBCodeGenerationEvaluator  # noqa: F401, F403
from .evaluator import LCBTestOutputEvaluator  # noqa: F401, F403
from .livecodebench import CompassBenchCodeExecutionDataset  # noqa: F401, F403
from .livecodebench import LCBCodeExecutionDataset  # noqa: F401, F403
from .livecodebench import LCBCodeGenerationDataset  # noqa: F401, F403
from .livecodebench import LCBTestOutputPredictionDataset  # noqa: F401, F403
@@ -228,11 +228,15 @@ def codegen_metrics(
@ICL_EVALUATORS.register_module()
class LCBCodeGenerationEvaluator(BaseEvaluator):

    def __init__(self, num_process_evaluate, timeout=6):
    def __init__(self,
                 num_process_evaluate,
                 timeout=6,
                 release_version='release_v1'):
        super().__init__()
        self.num_process_evaluate = num_process_evaluate
        self.timeout = timeout
        self.dataset = LCBCodeGenerationDataset.load()['test']
        self.dataset = LCBCodeGenerationDataset.load(
            release_version=release_version)['test']

    def score(self, predictions, references):
        predictions = [[extract_code_generation(item)] for item in predictions]
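The constructor change above threads a `release_version` argument through to `LCBCodeGenerationDataset.load()`, so the evaluator scores against the same LiveCodeBench release the config selects. A hedged construction sketch (values are placeholders; building the evaluator also loads the reference split, so it needs access to the dataset):

```python
from opencompass.datasets import LCBCodeGenerationEvaluator

evaluator = LCBCodeGenerationEvaluator(
    num_process_evaluate=4,         # parallel scoring workers
    timeout=6,                      # seconds allowed per test execution
    release_version='release_v4',   # should match the dataset config's release
)
```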
@@ -8,7 +8,7 @@ import zlib
from dataclasses import dataclass
from enum import Enum

from datasets import DatasetDict, load_dataset
from datasets import DatasetDict, load_dataset, load_from_disk

from opencompass.utils import get_data_path  # noqa: F401, F403
@@ -215,3 +215,37 @@ class LCBSelfRepairDataset(BaseDataset):
        dataset = dataset.map(transform)

        return DatasetDict({'test': dataset, 'train': dataset})


class CompassBenchCodeExecutionDataset(BaseDataset):

    @staticmethod
    def load(
        path: str = 'opencompass/execution-v2',
        local_mode: bool = False,
        cot: bool = False,
        # release_version: str = "release_v1"
    ):
        # path = get_data_path(path, local_mode=local_mode)

        def transform(item):
            code, input = item['code'], item['input']
            prompt = make_code_execution_prompt(code, input, cot=cot)

            item['prompt'] = prompt

            evaluation_sample = json.dumps({
                'code': item['code'],
                'input': item['input'],
                'output': item['output']
            })
            item['evaluation_sample'] = evaluation_sample

            return item

        path = get_data_path(path, local_mode=local_mode)
        dataset = load_from_disk(path)  # 'livecodebench/execution-v2'
        dataset = dataset['test']
        dataset = dataset.map(transform)

        return DatasetDict({'test': dataset, 'train': dataset})
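A hedged usage sketch for the new dataset class (it is re-exported from `opencompass.datasets` in the `__init__` change earlier in this diff); `load()` resolves its default path through `get_data_path` and reads the prepared dataset from disk, so the data must already be fetched:

```python
from opencompass.datasets import CompassBenchCodeExecutionDataset

ds = CompassBenchCodeExecutionDataset.load(cot=False)  # default path 'opencompass/execution-v2'
print(ds['test'][0]['prompt'][:200])  # the rendered code-execution prompt
```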
@@ -390,6 +390,7 @@ class LMTemplateParser:
                elif item.get('prompt', ''):  # it's a dict
                    prompt += last_sep + item.get('prompt', '')
                last_sep = '\n'

        return prompt

    def _split_rounds(
@@ -176,7 +176,7 @@ class VOLCRunner(BaseRunner):
        cmd = get_cmd()

        logger = get_logger()
        logger.debug(f'Running command: {cmd}')
        logger.info(f'Running command: {cmd}')

        out_path = task.get_log_path(file_extension='txt')
        mmengine.mkdir_or_exist(osp.split(out_path)[0])
@@ -205,10 +205,17 @@ class VOLCRunner(BaseRunner):
        return task_name, returncode

    def _run_task(self, cmd, log_path, poll_interval):
        logger = get_logger()
        result = subprocess.run(cmd,
                                shell=True,
                                text=True,
                                capture_output=True)

        logger.info(f'Command output: {result.stdout}')
        if result.stderr:
            logger.error(f'Command error: {result.stderr}')
        logger.info(f'Return code: {result.returncode}')

        pattern = r'(?<=task_id=).*(?=\n\n)'
        match = re.search(pattern, result.stdout)
        if match:
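The lookbehind/lookahead pattern above pulls the text that follows `task_id=` up to the next blank line out of the captured stdout. A self-contained check with made-up runner output:

```python
import re

pattern = r'(?<=task_id=).*(?=\n\n)'
fake_stdout = 'submit ok\ntask_id=t-20241118-abcdef\n\nqueued\n'
match = re.search(pattern, fake_stdout)
print(match.group(0) if match else None)  # t-20241118-abcdef
```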
@@ -554,4 +554,8 @@ DATASETS_URL = {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
        "md5": "9107597d137e7362eaf7d218ddef7a6d",
    },
    "subjective/judgerbench": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",
        "md5": "60d605883aa8cac9755819140ab42c6b"
    }
}