This commit is contained in:
leao1995 2025-05-29 14:37:25 +08:00 committed by GitHub
commit a08be10602
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 713 additions and 0 deletions

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess
# CoinFlip ships a single JSON dump, so both splits are mapped onto 'test'.
coinflip_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
    'train_split': 'test',
    'test_split': 'test',
}

# Zero-shot chain-of-thought prompting: the model is asked to reason step by
# step and to emit a parseable `The answer is [ANSWER]` line.
coinflip_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': 'Question: {question}\nPlease reason step by step, and format your final answer as `The answer is [ANSWER]`, where [ANSWER] should be `yes` or `no`.\nAnswer:',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 512},
}

# Accuracy over the yes/no label extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': coinflip_pred_postprocess},
}

coinflip_datasets = [
    {
        'abbr': 'coinflip',
        'type': CoinFlipDataset,
        'path': 'coin_flip',
        'reader_cfg': coinflip_reader_cfg,
        'infer_cfg': coinflip_infer_cfg,
        'eval_cfg': coinflip_eval_cfg,
    },
]

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess
# CoinFlip ships a single JSON dump, so both splits are mapped onto 'test'.
coinflip_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
    'train_split': 'test',
    'test_split': 'test',
}

# Zero-shot direct-answer prompting: no chain of thought, just yes/no.
coinflip_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': 'Question: {question}\nPlease respond `yes` or `no` directly without any additional explanations.\nAnswer:',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 512},
}

# Accuracy over the yes/no label extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': coinflip_pred_postprocess},
}

coinflip_datasets = [
    {
        'abbr': 'coinflip',
        'type': CoinFlipDataset,
        'path': 'coin_flip',
        'reader_cfg': coinflip_reader_cfg,
        'infer_cfg': coinflip_infer_cfg,
        'eval_cfg': coinflip_eval_cfg,
    },
]

View File

@ -0,0 +1,57 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess
# CoinFlip ships a single JSON dump, so both splits are mapped onto 'test'.
coinflip_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)
# 8-shot chain-of-thought prompting: the fixed exemplars demonstrate counting
# who flipped the coin, reasoning about flip-count parity, and ending with the
# parseable sentence 'So the answer is yes/no.'.
coinflip_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ka flips the coin. Sherrie flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Ka and Sherrie. So the coin was flipped 2 times, which is an even number. The coin started heads up, so after an even number of flips, it will still be heads up.\nSo the answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Jamey flips the coin. Teressa flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Jamey and Teressa. So the coin was flipped 2 times, which is an even number. The coin started heads up, so after an even number of flips, it will still be heads up.\nSo the answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Maybelle flips the coin. Shalonda does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Maybelle. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Millicent does not flip the coin. Conception flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Conception. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Sal flips the coin. Raymond does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Sal. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Conception flips the coin. Kristian does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Conception. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Inga does not flip the coin. Elanor does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by no one. So the coin was flipped 0 times. The coin started heads up, and it was not flipped, so it is still heads up.\nSo the answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ryan flips the coin. Shaunda flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Ryan and Shaunda. So the coin was flipped 2 times, which is an even number. The coin started heads up, so after an even number of flips, it will still be heads up.\nSo the answer is yes.\n'),
                # The actual test question; {question} is filled by the reader.
                dict(role='HUMAN', prompt='Question: {question}\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)
# Accuracy over the yes/no label extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=coinflip_pred_postprocess),
)
coinflip_datasets = [
    dict(
        abbr='coinflip',
        type=CoinFlipDataset,
        path='coin_flip',
        reader_cfg=coinflip_reader_cfg,
        infer_cfg=coinflip_infer_cfg,
        eval_cfg=coinflip_eval_cfg
    )
]

View File

@ -0,0 +1,57 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess
# CoinFlip ships a single JSON dump, so both splits are mapped onto 'test'.
coinflip_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)
# 8-shot direct-answer prompting: the same exemplar questions as the CoT
# variant, but each BOT turn shows only 'The answer is yes/no.' and every
# HUMAN turn instructs the model to skip reasoning steps.
coinflip_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ka flips the coin. Sherrie flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Jamey flips the coin. Teressa flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Maybelle flips the coin. Shalonda does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Millicent does not flip the coin. Conception flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Sal flips the coin. Raymond does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Conception flips the coin. Kristian does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Inga does not flip the coin. Elanor does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ryan flips the coin. Shaunda flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                # The actual test question; {question} is filled by the reader.
                dict(role='HUMAN', prompt='Question: {question}\nPlease answer directly without additional reasoning steps.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)
# Accuracy over the yes/no label extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=coinflip_pred_postprocess),
)
coinflip_datasets = [
    dict(
        abbr='coinflip',
        type=CoinFlipDataset,
        path='coin_flip',
        reader_cfg=coinflip_reader_cfg,
        infer_cfg=coinflip_infer_cfg,
        eval_cfg=coinflip_eval_cfg
    )
]

View File

@ -0,0 +1,47 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess
# GPQA rows carry the question plus four shuffled options A-D.
gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

# Direct-answer prompting: the model is told to skip reasoning and reply in
# the fixed 'The correct answer is (...)' shape that the option
# postprocessor can parse.
gpqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN',
                     prompt='What is the correct answer to this question: {question}\nChoices:\n'
                     '(A){A}\n'
                     '(B){B}\n'
                     '(C){C}\n'
                     '(D){D}\n'
                     'Please provide your answer directly without any additional reasoning steps or explanations. '
                     'Format your response as follows: "The correct answer is (insert answer here)"'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Scored as multiple choice; first_option_postprocess pulls the A-D letter.
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
                     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

gpqa_datasets = []
gpqa_subsets = {
    'extended': 'gpqa_extended.csv',
    'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}
# Iterate the dict items directly instead of materialising list(keys()) and
# re-indexing the dict inside the loop.
for split, filename in gpqa_subsets.items():
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=filename,
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )

View File

@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
# GSM8K exposes a single 'question' column; gold labels live in 'answer'.
gsm8k_reader_cfg = {'input_columns': ['question'], 'output_column': 'answer'}

# Zero-shot, answer-only prompting: the model is told to skip intermediate
# reasoning and wrap the result in \boxed{} for the MATH-style extractor.
gsm8k_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': '{question}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 512},
}

# Scored with the MATH v2 evaluator so \boxed{} answers are parsed.
gsm8k_eval_cfg = {
    'evaluator': {'type': MATHEvaluator, 'version': 'v2'},
    'pred_postprocessor': {'type': math_postprocess_v2},
    'dataset_postprocessor': {'type': gsm8k_dataset_postprocess},
}

gsm8k_datasets = [
    {
        'abbr': 'gsm8k',
        'type': GSM8KDataset,
        'path': 'opencompass/gsm8k',
        'reader_cfg': gsm8k_reader_cfg,
        'infer_cfg': gsm8k_infer_cfg,
        'eval_cfg': gsm8k_eval_cfg,
    },
]

View File

@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
# GSM8K exposes a single 'question' column; gold labels live in 'answer'.
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
# 4-shot direct-answer prompting: each exemplar shows only the final numeric
# answer (no chain of thought), and every turn repeats the "answer directly"
# instruction plus the parseable 'The answer is (...)' format.
gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt='The answer is 4\n'),
                dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt="The answer is 201\n"),
                dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt="The answer is 140\n"),
                dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt='The answer is 146\n'),
                # The actual test question; {question} is filled by the reader.
                dict(role='HUMAN', prompt="Question: {question}\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
            ],
        )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
# Standard GSM8K scoring: extract the last number and compare to the gold.
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                      pred_postprocessor=dict(type=gsm8k_postprocess),
                      dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg)
]

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess
# The last-letters dump has only one split, so both splits point at 'test'.
last_letters_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
    'train_split': 'test',
    'test_split': 'test',
}

# Zero-shot chain-of-thought prompting with a parseable final-answer line.
last_letters_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': 'Question: {question}\nPlease reason step by step, and format your final answer as `The answer is [ANSWER]`.\nAnswer:',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 512},
}

# Accuracy over the string extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': last_letters_pred_postprocess},
}

last_letters_datasets = [
    {
        'abbr': 'last_letters',
        'type': LastLettersDataset,
        'path': 'last_letters',
        'reader_cfg': last_letters_reader_cfg,
        'infer_cfg': last_letters_infer_cfg,
        'eval_cfg': last_letters_eval_cfg,
    },
]

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess
# The last-letters dump has only one split, so both splits point at 'test'.
last_letters_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
    'train_split': 'test',
    'test_split': 'test',
}

# Zero-shot direct-answer prompting: no explanations, just the formatted
# final answer line.
last_letters_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': 'Question: {question}\nPlease respond directly without any additional explanations, and format your final answer as `The answer is [ANSWER]`.\nAnswer:',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 512},
}

# Accuracy over the string extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': last_letters_pred_postprocess},
}

last_letters_datasets = [
    {
        'abbr': 'last_letters',
        'type': LastLettersDataset,
        'path': 'last_letters',
        'reader_cfg': last_letters_reader_cfg,
        'infer_cfg': last_letters_infer_cfg,
        'eval_cfg': last_letters_eval_cfg,
    },
]

View File

@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess
# The last-letters dump has only one split, so both splits point at 'test'.
last_letters_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)
# 4-shot chain-of-thought prompting: exemplars walk through taking the last
# letter of each word before concatenating, ending with 'So the answer is X.'.
last_letters_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Elon Musk" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Elon" is "n". The last letter of "Musk" is "k". Concatenating them is "nk".\nSo the answer is nk.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Larry Page" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Larry" is "y". The last letter of "Page" is "e". Concatenating them is "ye".\nSo the answer is ye.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Sergey Brin" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Sergey" is "y". The last letter of "Brin" is "n". Concatenating them is "yn".\nSo the answer is yn.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Bill Gates" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Bill" is "l". The last letter of "Gates" is "s". Concatenating them is "ls".\nSo the answer is ls.\n'),
                # The actual test question; {question} is filled by the reader.
                dict(role='HUMAN', prompt='Question: {question}\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)
# Accuracy over the string extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=last_letters_pred_postprocess),
)
last_letters_datasets = [
    dict(
        abbr='last_letters',
        type=LastLettersDataset,
        path='last_letters',
        reader_cfg=last_letters_reader_cfg,
        infer_cfg=last_letters_infer_cfg,
        eval_cfg=last_letters_eval_cfg
    )
]

View File

@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess
# The last-letters dump has only one split, so both splits point at 'test'.
last_letters_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)
# 4-shot direct-answer prompting: same exemplar questions as the CoT variant,
# but each BOT turn shows only 'The answer is X.' and every HUMAN turn asks
# the model to skip reasoning steps.
last_letters_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Elon Musk" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is nk.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Larry Page" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is ye.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Sergey Brin" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yn.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Bill Gates" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is ls.\n'),
                # The actual test question; {question} is filled by the reader.
                dict(role='HUMAN', prompt='Question: {question}\nPlease answer directly without additional reasoning steps.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)
# Accuracy over the string extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=last_letters_pred_postprocess),
)
last_letters_datasets = [
    dict(
        abbr='last_letters',
        type=LastLettersDataset,
        path='last_letters',
        reader_cfg=last_letters_reader_cfg,
        infer_cfg=last_letters_infer_cfg,
        eval_cfg=last_letters_eval_cfg
    )
]

View File

@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
# MATH rows provide the 'problem' statement; gold text lives in 'solution'.
math_reader_cfg = {'input_columns': ['problem'], 'output_column': 'solution'}

# Zero-shot, answer-only prompting: no intermediate reasoning, final answer
# wrapped in \boxed{} so the v2 postprocessor can extract it.
math_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': '{problem}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.',
                },
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
}

# v2 evaluator + v2 postprocessor pair for \boxed{} extraction.
math_eval_cfg = {
    'evaluator': {'type': MATHEvaluator, 'version': 'v2'},
    'pred_postprocessor': {'type': math_postprocess_v2},
}

math_datasets = [
    {
        'type': MATHDataset,
        'abbr': 'math',
        'path': 'opencompass/math',
        'reader_cfg': math_reader_cfg,
        'infer_cfg': math_infer_cfg,
        'eval_cfg': math_eval_cfg,
    },
]

View File

@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2
# MATH rows provide the 'problem' statement; gold text lives in 'solution'.
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
# 4-shot answer-only prompting: each exemplar shows just the boxed final
# answer, and every turn instructs the model to skip intermediate reasoning.
# Backslashes in the LaTeX snippets are doubled: sequences such as '\sqrt'
# and '\det' in non-raw strings are invalid escapes (DeprecationWarning now,
# SyntaxError in future Python); '\\sqrt' yields the same runtime bytes.
# NOTE(review): the prompts mix doubled braces ('{{...}}') with the single
# braces in '\\boxed{}', and the first exemplar ends with a stray '}}' —
# confirm against the template-substitution rules before changing either.
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{[2,5)}}\n'),
            dict(role='HUMAN', prompt='Problem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{24}}\n'),
            dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{16}}\n'),
            dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{-\\frac{{2}}{{3}}}}\n'),
            # The actual test problem; {problem} is filled by the reader.
            dict(role='HUMAN', prompt='Problem:\n{problem}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
# v2 evaluator + v2 postprocessor pair for \boxed{} extraction.
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))
math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]

View File

@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUProDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
# read_base() lets this mmengine config import a sibling config module;
# `categories` enumerates the MMLU-Pro subject splits.
with read_base():
    from .mmlu_pro_categories import categories
# Direct-answer prompt: options may run A-P (up to 16 choices); the model
# must reply 'ANSWER: $LETTER' and skip explanations.
QUERY_TEMPLATE = """
Answer the following multiple choice question. Your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Please answer directly without additional explanations.
Question:\n
{question}
Options:\n
{options_str}
""".strip()
# One dataset entry is appended per subject category.
mmlu_pro_datasets = []
for category in categories:
    mmlu_pro_reader_cfg = dict(
        input_columns=['question', 'cot_content', 'options_str'],
        output_column='answer',
        train_split='validation',
        test_split='test',
    )
    mmlu_pro_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN',
                         prompt=QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    # Accuracy on the letter captured from 'ANSWER: X' (case-insensitive).
    mmlu_pro_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(
            type=match_answer_pattern,
            answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])')
    )
    mmlu_pro_datasets.append(
        dict(
            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
            type=MMLUProDataset,
            path='opencompass/mmlu_pro',
            category=category,
            reader_cfg=mmlu_pro_reader_cfg,
            infer_cfg=mmlu_pro_infer_cfg,
            eval_cfg=mmlu_pro_eval_cfg,
        ))

View File

@ -34,6 +34,7 @@ from .cmmlu import * # noqa: F401, F403
from .cmnli import * # noqa: F401, F403
from .cmo_fib import * # noqa: F401, F403
from .cmrc import * # noqa: F401, F403
from .coinflip import * # noqa: F401, F403
from .commonsenseqa import * # noqa: F401, F403
from .commonsenseqa_cn import * # noqa: F401, F403
from .copa import * # noqa: F401, F403
@ -78,6 +79,7 @@ from .judge import * # noqa: F401, F403
from .kaoshi import KaoshiDataset, KaoshiEvaluator # noqa: F401, F403
from .korbench import * # noqa: F401, F403
from .lambada import * # noqa: F401, F403
from .last_letters import * # noqa: F401, F403
from .lawbench import * # noqa: F401, F403
from .LCBench import * # noqa: F401, F403
from .lcsts import * # noqa: F401, F403

View File

@ -0,0 +1,39 @@
import os
import re
import json
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
from opencompass.utils.datasets import DEFAULT_DATA_FOLDER
from opencompass.utils.fileio import download_url
@LOAD_DATASET.register_module()
class CoinFlipDataset(BaseDataset):
    """Coin-flip state-tracking QA set loaded from a cached JSON dump.

    The raw file is fetched from the Chain-of-Knowledge repository on
    first use and cached under the OpenCompass data folder.
    """

    @staticmethod
    def load(path: str):
        # NOTE(review): `path` is unused here; the on-disk location below
        # is hard-coded — confirm this matches the config's `path` intent.
        cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
        local_path = './data/coin_flip/coin_flip.json'
        data_path = os.path.join(DEFAULT_DATA_FOLDER, cache_dir, local_path)

        # Download once; subsequent runs hit the cached copy.
        if not os.path.exists(data_path):
            dataset_url = ('https://raw.githubusercontent.com/wjn1996/'
                           'Chain-of-Knowledge/refs/heads/main/tasks/Coin/'
                           'dataset/coin_flip.json')
            download_url(dataset_url, os.path.dirname(data_path))

        with open(data_path, 'r', encoding='utf-8') as f:
            examples = list(json.load(f)["examples"])

        # Only a 'test' split exists for this benchmark.
        return DatasetDict({'test': Dataset.from_list(examples)})
@TEXT_POSTPROCESSORS.register_module('coinflip')
def coinflip_pred_postprocess(text: str) -> str:
    """Extract the final yes/no verdict from a model completion.

    Keeps only the tail after the last 'answer is ' marker, then returns
    the first 'yes'/'no' found in it (lower-cased), or '' when absent.
    """
    tail = text.split('answer is ')[-1].lower()
    found = re.search(r'(yes|no)', tail)
    return found.group(1) if found else ''

View File

@ -0,0 +1,37 @@
import os
import re
import json
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
from opencompass.utils.datasets import DEFAULT_DATA_FOLDER
from opencompass.utils.fileio import download_url
@LOAD_DATASET.register_module()
class LastLettersDataset(BaseDataset):
    """Last-letter-concatenation QA set loaded from a cached JSON dump.

    The raw file is fetched from the Chain-of-Knowledge repository on
    first use and cached under the OpenCompass data folder.
    """

    @staticmethod
    def load(path: str):
        # NOTE(review): `path` is unused here; the on-disk location below
        # is hard-coded — confirm this matches the config's `path` intent.
        cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
        local_path = './data/last_letters/last_letters.json'
        data_path = os.path.join(DEFAULT_DATA_FOLDER, cache_dir, local_path)

        # Download once; subsequent runs hit the cached copy.
        if not os.path.exists(data_path):
            dataset_url = ('https://raw.githubusercontent.com/wjn1996/'
                           'Chain-of-Knowledge/refs/heads/main/tasks/Letter/'
                           'dataset/last_letters.json')
            download_url(dataset_url, os.path.dirname(data_path))

        with open(data_path, 'r', encoding='utf-8') as f:
            examples = list(json.load(f)["examples"])

        # Only a 'test' split exists for this benchmark.
        return DatasetDict({'test': Dataset.from_list(examples)})
@TEXT_POSTPROCESSORS.register_module('last_letters')
def last_letters_pred_postprocess(text: str) -> str:
    """Normalise a model answer for the last-letter-concatenation task.

    Takes the tail after the last 'answer is ' marker, strips quotes,
    newlines, dots and whitespace, then trims remaining non-letter
    characters from both ends.
    """
    text = text.split('answer is ')[-1]
    # Raw strings: the original non-raw pattern relied on '\.' and '\s'
    # being invalid string escapes that pass through unchanged
    # (DeprecationWarning today, SyntaxError in future Python). The regex
    # the engine sees — and therefore the behavior — is identical.
    text = re.sub(r"\"|\'|\n|\.|\s", "", text)
    text = re.sub(r"^[^a-zA-Z]+|[^a-zA-Z]+$", "", text)
    return text