diff --git a/configs/datasets/subjective/judgerbench/judgerbench.py b/configs/datasets/subjective/judgerbench/judgerbench.py
index 3c436585..e6aafb12 100644
--- a/configs/datasets/subjective/judgerbench/judgerbench.py
+++ b/configs/datasets/subjective/judgerbench/judgerbench.py
@@ -47,8 +47,3 @@ for _name in subjective_all_sets:
             infer_cfg=subjective_infer_cfg,
             eval_cfg=subjective_eval_cfg,
         ))
-# ds1000_eval_cfg = dict(
-#     evaluator=dict(type=DS1000Evaluator),
-#     pred_role='BOT',
-#     pred_postprocessor=dict(type=ds1000_postprocess),
-# )
diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py
new file mode 100644
index 00000000..0a1790e0
--- /dev/null
+++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py
@@ -0,0 +1,39 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
+
+
+aime2024_reader_cfg = dict(
+    input_columns=['question'],
+    output_column='answer'
+)
+
+
+aime2024_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
+            ],
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=2048)
+)
+
+aime2024_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
+)
+
+aime2024_datasets = [
+    dict(
+        abbr='aime2024',
+        type=Aime2024Dataset,
+        path='opencompass/aime2024',
+        reader_cfg=aime2024_reader_cfg,
+        infer_cfg=aime2024_infer_cfg,
+        eval_cfg=aime2024_eval_cfg
+    )
+]
\ No newline at end of file
diff --git a/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_9c32f6.py b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_9c32f6.py
new file mode 100644
index 00000000..586d0107
--- /dev/null
+++ b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_9c32f6.py
@@ -0,0 +1,96 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+bbh_multiple_choice_sets = [
+    'temporal_sequences',
+    'disambiguation_qa',
+    'date_understanding',
+    'tracking_shuffled_objects_three_objects',
+    'penguins_in_a_table',
+    'geometric_shapes',
+    'snarks',
+    'ruin_names',
+    'tracking_shuffled_objects_seven_objects',
+    'tracking_shuffled_objects_five_objects',
+    'logical_deduction_three_objects',
+    'hyperbaton',
+    'logical_deduction_five_objects',
+    'logical_deduction_seven_objects',
+    'movie_recommendation',
+    'salient_translation_error_detection',
+    'reasoning_about_colored_objects',
+]
+bbh_free_form_sets = [
+    'multistep_arithmetic_two',
+    'navigate',
+    'dyck_languages',
+    'word_sorting',
+    'sports_understanding',
+    'boolean_expressions',
+    'object_counting',
+    'formal_fallacies',
+    'causal_judgement',
+    'web_of_lies',
+]
+
+bbh_datasets = []
+for _name in bbh_multiple_choice_sets:
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
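+            # Note: the doubled braces in {{input}} below are f-string
+            # escapes; the rendered template keeps a literal {input}
+            # placeholder, which is filled from the dataset's 'input' column.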
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(
+        evaluator=dict(type=BBHEvaluator_mcq),
+        pred_role='BOT',
+        pred_postprocessor=dict(type=bbh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
+
+for _name in bbh_free_form_sets:
+
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
diff --git a/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_ea7952.py b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_ea7952.py
new file mode 100644
index 00000000..61ea50dd
--- /dev/null
+++ b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_gen_ea7952.py
@@ -0,0 +1,96 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+bbh_multiple_choice_sets = [
+    'temporal_sequences',
+    'disambiguation_qa',
+    'date_understanding',
+    'tracking_shuffled_objects_three_objects',
+    'penguins_in_a_table',
+    'geometric_shapes',
+    'snarks',
+    'ruin_names',
+    'tracking_shuffled_objects_seven_objects',
+    'tracking_shuffled_objects_five_objects',
+    'logical_deduction_three_objects',
+    'hyperbaton',
+    'logical_deduction_five_objects',
+    'logical_deduction_seven_objects',
+    'movie_recommendation',
+    'salient_translation_error_detection',
+    'reasoning_about_colored_objects',
+]
+bbh_free_form_sets = [
+    'multistep_arithmetic_two',
+    'navigate',
+    'dyck_languages',
+    'word_sorting',
+    'sports_understanding',
+    'boolean_expressions',
+    'object_counting',
+    'formal_fallacies',
+    'causal_judgement',
+    'web_of_lies',
+]
+
+bbh_datasets = []
+for _name in bbh_multiple_choice_sets:
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(
+        evaluator=dict(type=BBHEvaluator_mcq),
+        pred_role='BOT',
+        pred_postprocessor=dict(type=bbh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+    bbh_datasets.append(
+        dict(
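+            # One dataset entry per BBH subtask, e.g. abbr='bbh-temporal_sequences';
+            # .copy() keeps each appended config independent across loop iterations.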
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
+
+for _name in bbh_free_form_sets:
+
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py
new file mode 100644
index 00000000..39b08adf
--- /dev/null
+++ b/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py
@@ -0,0 +1,39 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2
+
+
+cmo_fib_reader_cfg = dict(
+    input_columns=['question'],
+    output_column='answer'
+)
+
+
+cmo_fib_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{question}\n你需要将最终答案写入\\boxed{}.'),
+            ],
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=2048)
+)
+
+cmo_fib_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
+)
+
+cmo_fib_datasets = [
+    dict(
+        abbr='cmo_fib',
+        type=CMOFibDataset,
+        path='opencompass/cmo_fib',
+        reader_cfg=cmo_fib_reader_cfg,
+        infer_cfg=cmo_fib_infer_cfg,
+        eval_cfg=cmo_fib_eval_cfg
+    )
+]
\ No newline at end of file
diff --git a/opencompass/configs/datasets/gpqa/gpqa_0shot_nocot_gen_772ea0.py b/opencompass/configs/datasets/gpqa/gpqa_0shot_nocot_gen_772ea0.py
new file mode 100644
index 00000000..4783dae4
--- /dev/null
+++ b/opencompass/configs/datasets/gpqa/gpqa_0shot_nocot_gen_772ea0.py
@@ -0,0 +1,52 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator
+
+# openai_simple_eval prompt
+align_prompt = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.
+
+{question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+gpqa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=align_prompt),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
+                     pred_postprocessor=dict(type=GPQA_Simple_Eval_postprocess))
+
+gpqa_datasets = []
+gpqa_subsets = {
+    # 'extended': 'gpqa_extended.csv',
+    # 'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in list(gpqa_subsets.keys()):
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg)
+    )
diff --git a/opencompass/configs/datasets/gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py
new file mode 100644
index 00000000..f1b7fe30
--- /dev/null
+++ b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_nocot_gen_6cbf22.py
@@ -0,0 +1,37 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
+from opencompass.datasets import MATHEvaluator, math_postprocess_v2
+
+gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{question}\nPlease put your final answer within \\boxed{}.'),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+gsm8k_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'),
+    pred_postprocessor=dict(type=math_postprocess_v2),
+    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
+)
+
+gsm8k_datasets = [
+    dict(
+        abbr='gsm8k',
+        type=GSM8KDataset,
+        path='opencompass/gsm8k',
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg,
+    )
+]
diff --git a/opencompass/configs/datasets/humaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py
new file mode 100644
index 00000000..8e91abdc
--- /dev/null
+++ b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_openai_simple_evals_gen_159614.py
@@ -0,0 +1,38 @@
+# THIS SHALL ALSO BE DEPRECATED
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2
+
+humaneval_plus_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_plus_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'
+            ),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_plus_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalPlusEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_plus_datasets = [
+    dict(
+        abbr='humaneval_plus',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_plus_reader_cfg,
+        infer_cfg=humaneval_plus_infer_cfg,
+        eval_cfg=humaneval_plus_eval_cfg)
+]
diff --git a/opencompass/configs/datasets/humanevalx/humanevalx_0shot_nocot_gen_3e4bbd.py b/opencompass/configs/datasets/humanevalx/humanevalx_0shot_nocot_gen_3e4bbd.py
new file mode 100644
index 00000000..b4d85fad
--- /dev/null
+++ b/opencompass/configs/datasets/humanevalx/humanevalx_0shot_nocot_gen_3e4bbd.py
@@ -0,0 +1,40 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
+
+humanevalx_reader_cfg = dict(
+    input_columns=['prompt'], output_column='declaration', train_split='test')
+
+humanevalx_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024))
+
+humanevalx_eval_cfg_dict = {
+    lang: dict(
+        evaluator=dict(
+            type=HumanevalXEvaluator,
+            language=lang,
+            ip_address=
+            'localhost',  # replace with your code_eval_server ip_address / port
+            port=5001),  # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
+        pred_role='BOT')
+    for lang in ['python', 'cpp', 'go', 'java', 'js']  # rust is not supported yet
+}
+
+# Please download the needed `xx.jsonl.gz` from
+# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
+# and move them into `data/humanevalx/` folder
+humanevalx_datasets = [
+    dict(
+        type=HumanevalXDataset,
+        abbr=f'humanevalx-{lang}',
+        language=lang,
+        path='./data/humanevalx',
+        reader_cfg=humanevalx_reader_cfg,
+        infer_cfg=humanevalx_infer_cfg,
+        eval_cfg=humanevalx_eval_cfg_dict[lang])
+    for lang in ['python', 'cpp', 'go', 'java', 'js']
+]
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py b/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py
new file mode 100644
index 00000000..b74e2c76
--- /dev/null
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py
@@ -0,0 +1,165 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+        release_version='release_v4',
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation_v4',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg,
+    release_version='release_v4',
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# Test Output Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
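+# NOTE: system_prompt above is currently unused: the SYSTEM turn in the
+# template below is commented out, so only the HUMAN round is sent.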
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
+]
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py b/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py
new file mode 100644
index 00000000..cd65c3ae
--- /dev/null
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py
@@ -0,0 +1,165 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+        release_version='release_split_v4',
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation_split_v4',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg,
+    release_version='release_split_v4',
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
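+                    # Same system message as livecodebench_o1_gen_f0ed6c.py;
+                    # this split_v4 variant differs only in the release_version
+                    # and abbr fields of the code-generation dataset.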
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# Test Output Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
+]
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py b/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py
new file mode 100644
index 00000000..f2a0d77e
--- /dev/null
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py
@@ -0,0 +1,164 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
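+# As in the other LiveCodeBench configs above, SYSTEM_MESSAGE_GENERIC is kept
+# for reference only; it is not wired into any prompt round in this file.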
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation_v1',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg,
+    release_version='release_v1',
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# Test Output Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
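+# Note: unlike the v4/split_v4 variants, lcb_code_generation_eval_cfg above
+# omits release_version, so LCBCodeGenerationEvaluator falls back to its
+# default 'release_v1' (see the evaluator.py change at the end of this patch).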
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
+]
diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py
new file mode 100644
index 00000000..c1d20a80
--- /dev/null
+++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py
@@ -0,0 +1,36 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024),
+)
+
+# postprocess v2
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
+)
+
+math_datasets = [
+    dict(
+        type=MATHDataset,
+        abbr='math_prm800k_500',
+        path='opencompass/math',
+        file_name='test_prm800k_500.json',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py
new file mode 100644
index 00000000..eabc6c68
--- /dev/null
+++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py
@@ -0,0 +1,85 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
+from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator
+# from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess
+from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE
+
+# ----------------------------- Eval Parameters -----------------------------
+## Postprocess function
+post_func = 're'  # 're', 'xfinder_model', 'naive_model'
+
+## Evaluate function
+eval_func = 'naive_model'  # 're', 'naive_model'
+
+
+## Model api url
+# xfinder_url = 'http://0.0.0.0:23333/v1'  # for 'xFinder-qwen1505' if post_func is 'xfinder_model'
+# naive_model_name = 'Qwen/Qwen2.5-72B-Instruct'  # replace with your model name
+naive_model_name = 'dlc_model'
+# naive_model_url = [
+#     'http://172.30.56.38:23001/v1',
+# ]  # multiple APIs for acceleration
+naive_model_url = [
+    "http://172.30.56.38:23001/v1",
+    "http://172.30.8.4:23003/v1",
+    "http://172.30.8.14:23002/v1",
+    "http://172.30.48.80:23004/v1",
+    "http://172.30.56.132:23005/v1",
+    "http://172.30.16.115:23006/v1",
+    "http://172.30.48.82:23007/v1",
+    "http://172.30.24.53:23008/v1",
+    "http://172.30.56.141:23009/v1",
+    "http://172.30.8.35:23010/v1",
+    "http://172.30.48.85:23011/v1",
+    "http://172.30.16.116:23012/v1"
+]
+# ----------------------------- Detailed Config -----------------------------
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=8192),
+)
+
+
+if post_func == 're':
+    pred_postprocessor = dict(type=math_postprocess_v2)
+
+
+if eval_func == 're':
+    evaluator = dict(type=MATHEvaluator, version='v2')
+elif eval_func == 'naive_model':
+    evaluator = dict(
+        type=GaoKaoMATHEvaluator,
+        judge_model_name=naive_model_name,
+        url=naive_model_url,
+    )
+
+# postprocess v2
+math_eval_cfg = dict(
+    evaluator=evaluator,
+    pred_postprocessor=pred_postprocessor,
+)
+
+math_datasets = [
+    dict(
+        type=MATHDataset,
+        abbr='math_prm800k_500-llmjudge',
+        path='opencompass/math',
+        file_name='test_prm800k_500.json',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
diff --git a/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py b/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py
new file mode 100644
index 00000000..17005593
--- /dev/null
+++ b/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py
@@ -0,0 +1,40 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
+
+sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
+
+sanitized_mbpp_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',),
+                # dict(role='BOT', prompt='```python\ndef similar_elements(test_tup1, test_tup2):\n    res = tuple(set(test_tup1) & set(test_tup2))\n    return (res)```',),
+
+                # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',),
+                # dict(role='BOT', prompt='```python\nimport math\ndef is_not_prime(n):\n    result = False\n    for i in range(2,int(math.sqrt(n)) + 1):\n        if n %% i == 0:\n            result = True\n    return result```',),
+
+                # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',),
+                # dict(role='BOT', prompt='```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n    largest_nums = hq.nlargest(n, nums)\n    return largest_nums```',),
+                dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n You should submit your final solution in the following format: ```python\n\n```',),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
+
+sanitized_mbpp_datasets = [
+    dict(
+        type=SanitizedMBPPDataset,
+        abbr='sanitized_mbpp',
+        path='opencompass/sanitized_mbpp',
+        reader_cfg=sanitized_mbpp_reader_cfg,
+        infer_cfg=sanitized_mbpp_infer_cfg,
+        eval_cfg=sanitized_mbpp_eval_cfg,
+    )
+]
diff --git a/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py b/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py
index 3c436585..e6aafb12 100644
--- a/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py
+++ b/opencompass/configs/datasets/subjective/judgerbench/judgerbench.py
@@ -47,8 +47,3 @@ for _name in subjective_all_sets:
             infer_cfg=subjective_infer_cfg,
             eval_cfg=subjective_eval_cfg,
         ))
-# ds1000_eval_cfg = dict(
-#     evaluator=dict(type=DS1000Evaluator),
-#     pred_role='BOT',
-#     pred_postprocessor=dict(type=ds1000_postprocess),
-# )
diff --git a/opencompass/datasets/livecodebench/__init__.py b/opencompass/datasets/livecodebench/__init__.py
index 5c7a048d..4870e556 100644
--- a/opencompass/datasets/livecodebench/__init__.py
+++ b/opencompass/datasets/livecodebench/__init__.py
@@ -1,6 +1,7 @@
 from .evaluator import LCBCodeExecutionEvaluator  # noqa: F401, F403
 from .evaluator import LCBCodeGenerationEvaluator  # noqa: F401, F403
 from .evaluator import LCBTestOutputEvaluator  # noqa: F401, F403
+from .livecodebench import CompassBenchCodeExecutionDataset  # noqa: F401, F403
 from .livecodebench import LCBCodeExecutionDataset  # noqa: F401, F403
 from .livecodebench import LCBCodeGenerationDataset  # noqa: F401, F403
 from .livecodebench import LCBTestOutputPredictionDataset  # noqa: F401, F403
diff --git a/opencompass/datasets/livecodebench/evaluator.py b/opencompass/datasets/livecodebench/evaluator.py
index 9ae936a8..cb6d2360 100644
--- a/opencompass/datasets/livecodebench/evaluator.py
+++ b/opencompass/datasets/livecodebench/evaluator.py
@@ -228,11 +228,15 @@ def codegen_metrics(
 @ICL_EVALUATORS.register_module()
 class LCBCodeGenerationEvaluator(BaseEvaluator):
 
-    def __init__(self, num_process_evaluate, timeout=6):
+    def __init__(self,
+                 num_process_evaluate,
+                 timeout=6,
+                 release_version='release_v1'):
         super().__init__()
         self.num_process_evaluate = num_process_evaluate
         self.timeout = timeout
-        self.dataset = LCBCodeGenerationDataset.load()['test']
+        self.dataset = LCBCodeGenerationDataset.load(
+            release_version=release_version)['test']
 
     def score(self, predictions, references):
         predictions = [[extract_code_generation(item)] for item in predictions]
diff --git a/opencompass/datasets/livecodebench/livecodebench.py b/opencompass/datasets/livecodebench/livecodebench.py
index aa7e82ef..dbd76d71 100644
--- a/opencompass/datasets/livecodebench/livecodebench.py
+++ b/opencompass/datasets/livecodebench/livecodebench.py
@@ -8,7 +8,7 @@
 import zlib
 from dataclasses import dataclass
 from enum import Enum
 
-from datasets import DatasetDict, load_dataset
+from datasets import DatasetDict, load_dataset, load_from_disk
 
 from opencompass.utils import get_data_path  # noqa: F401, F403
@@ -215,3 +215,37 @@ class LCBSelfRepairDataset(BaseDataset):
         dataset = dataset.map(transform)
 
         return DatasetDict({'test': dataset, 'train': dataset})
+
+
+class CompassBenchCodeExecutionDataset(BaseDataset):
+
+    @staticmethod
+    def load(
+        path: str = 'opencompass/execution-v2',
+        local_mode: bool = False,
+        cot: bool = False,
+        # release_version: str = "release_v1"
+    ):
+        # path = get_data_path(path, local_mode=local_mode)
+
+        def transform(item):
+            code, input = item['code'], item['input']
+            prompt = make_code_execution_prompt(code, input, cot=cot)
+
+            item['prompt'] = prompt
+
+            evaluation_sample = json.dumps({
+                'code': item['code'],
+                'input': item['input'],
+                'output': item['output']
+            })
+            item['evaluation_sample'] = evaluation_sample
+
+            return item
+
+        path = get_data_path(path, local_mode=local_mode)
+        dataset = load_from_disk(path)  # 'livecodebench/execution-v2'
+        dataset = dataset['test']
+        dataset = dataset.map(transform)
+
+        return DatasetDict({'test': dataset, 'train': dataset})
diff --git a/opencompass/models/base.py b/opencompass/models/base.py
index 9e983f39..eb8e298d 100644
--- a/opencompass/models/base.py
+++ b/opencompass/models/base.py
@@ -390,6 +390,7 @@ class LMTemplateParser:
             elif item.get('prompt', ''):  # it's a dict
                 prompt += last_sep + item.get('prompt', '')
                 last_sep = '\n'
+        return prompt
 
     def _split_rounds(
diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py
index 81d0c869..d48f7a43 100644
--- a/opencompass/runners/volc.py
+++ b/opencompass/runners/volc.py
@@ -176,7 +176,7 @@ class VOLCRunner(BaseRunner):
         cmd = get_cmd()
 
         logger = get_logger()
-        logger.debug(f'Running command: {cmd}')
+        logger.info(f'Running command: {cmd}')
 
         out_path = task.get_log_path(file_extension='txt')
         mmengine.mkdir_or_exist(osp.split(out_path)[0])
@@ -205,10 +205,17 @@
         return task_name, returncode
 
     def _run_task(self, cmd, log_path, poll_interval):
+        logger = get_logger()
         result = subprocess.run(cmd,
                                 shell=True,
                                 text=True,
                                 capture_output=True)
+
+        logger.info(f'Command output: {result.stdout}')
+        if result.stderr:
+            logger.error(f'Command error: {result.stderr}')
+        logger.info(f'Return code: {result.returncode}')
+
         pattern = r'(?<=task_id=).*(?=\n\n)'
         match = re.search(pattern, result.stdout)
         if match:
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 8fe89971..21e640c8 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -554,4 +554,8 @@
         "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
         "md5": "9107597d137e7362eaf7d218ddef7a6d",
     },
+    "subjective/judgerbench": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",
+        "md5": "60d605883aa8cac9755819140ab42c6b"
+    }
 }