mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
add additional evaluation configs
This commit is contained in:
parent
68a9838907
commit
22cf36683f
@ -0,0 +1,41 @@
|
||||
# Zero-shot chain-of-thought (CoT) evaluation config for the CoinFlip dataset:
# the model is asked to reason step by step and emit a yes/no final answer.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
coinflip_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

coinflip_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: {question}\nPlease reason step by step, and format your final answer as `The answer is [ANSWER]`, where [ANSWER] should be `yes` or `no`.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),  # zero-shot: no in-context examples
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the yes/no answer extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=coinflip_pred_postprocess),
)

coinflip_datasets = [
    dict(
        abbr='coinflip',
        type=CoinFlipDataset,
        path='coin_flip',
        reader_cfg=coinflip_reader_cfg,
        infer_cfg=coinflip_infer_cfg,
        eval_cfg=coinflip_eval_cfg
    )
]
|
41
opencompass/configs/datasets/coinflip/coinflip_0shot_gen.py
Normal file
41
opencompass/configs/datasets/coinflip/coinflip_0shot_gen.py
Normal file
@ -0,0 +1,41 @@
|
||||
# Zero-shot direct-answer evaluation config for the CoinFlip dataset:
# the model must answer yes/no without any reasoning steps.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
coinflip_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

coinflip_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: {question}\nPlease respond `yes` or `no` directly without any additional explanations.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),  # zero-shot: no in-context examples
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the yes/no answer extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=coinflip_pred_postprocess),
)

coinflip_datasets = [
    dict(
        abbr='coinflip',
        type=CoinFlipDataset,
        path='coin_flip',
        reader_cfg=coinflip_reader_cfg,
        infer_cfg=coinflip_infer_cfg,
        eval_cfg=coinflip_eval_cfg
    )
]
|
@ -0,0 +1,57 @@
|
||||
# Few-shot chain-of-thought evaluation config for the CoinFlip dataset:
# eight hand-written CoT exemplars are baked into the prompt template,
# followed by the actual {question}.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
coinflip_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

coinflip_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # 8 fixed in-prompt exemplars (ZeroRetriever is still used
                # because the shots live in the template, not the retriever).
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ka flips the coin. Sherrie flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Ka and Sherrie. So the coin was flipped 2 times, which is an even number. The coin started heads up, so after an even number of flips, it will still be heads up.\nSo the answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Jamey flips the coin. Teressa flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Jamey and Teressa. So the coin was flipped 2 times, which is an even number. The coin started heads up, so after an even number of flips, it will still be heads up.\nSo the answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Maybelle flips the coin. Shalonda does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Maybelle. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Millicent does not flip the coin. Conception flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Conception. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Sal flips the coin. Raymond does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Sal. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Conception flips the coin. Kristian does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Conception. So the coin was flipped 1 time, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up.\nSo the answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Inga does not flip the coin. Elanor does not flip the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by no one. So the coin was flipped 0 times. The coin started heads up, and it was not flipped, so it is still heads up.\nSo the answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ryan flips the coin. Shaunda flips the coin. Is the coin still heads up?\nAnswer:'),
                dict(role='BOT', prompt='The coin was flipped by Ryan and Shaunda. So the coin was flipped 2 times, which is an even number. The coin started heads up, so after an even number of flips, it will still be heads up.\nSo the answer is yes.\n'),
                # The actual query to be answered.
                dict(role='HUMAN', prompt='Question: {question}\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the yes/no answer extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=coinflip_pred_postprocess),
)

coinflip_datasets = [
    dict(
        abbr='coinflip',
        type=CoinFlipDataset,
        path='coin_flip',
        reader_cfg=coinflip_reader_cfg,
        infer_cfg=coinflip_infer_cfg,
        eval_cfg=coinflip_eval_cfg
    )
]
|
@ -0,0 +1,57 @@
|
||||
# Few-shot direct-answer evaluation config for the CoinFlip dataset:
# the same eight exemplar questions as the CoT variant, but each exemplar
# answer is given directly ("The answer is yes/no.") with no reasoning.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CoinFlipDataset, coinflip_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
coinflip_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

coinflip_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # 8 fixed in-prompt exemplars with direct answers.
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ka flips the coin. Sherrie flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Jamey flips the coin. Teressa flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Maybelle flips the coin. Shalonda does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Millicent does not flip the coin. Conception flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Sal flips the coin. Raymond does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Conception flips the coin. Kristian does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is no.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Inga does not flip the coin. Elanor does not flip the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                dict(role='HUMAN', prompt='Question: A coin is heads up. Ryan flips the coin. Shaunda flips the coin. Is the coin still heads up?\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yes.\n'),
                # The actual query to be answered.
                dict(role='HUMAN', prompt='Question: {question}\nPlease answer directly without additional reasoning steps.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the yes/no answer extracted by coinflip_pred_postprocess.
coinflip_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=coinflip_pred_postprocess),
)

coinflip_datasets = [
    dict(
        abbr='coinflip',
        type=CoinFlipDataset,
        path='coin_flip',
        reader_cfg=coinflip_reader_cfg,
        infer_cfg=coinflip_infer_cfg,
        eval_cfg=coinflip_eval_cfg
    )
]
|
47
opencompass/configs/datasets/gpqa/gpqa_0shot_gen.py
Normal file
47
opencompass/configs/datasets/gpqa/gpqa_0shot_gen.py
Normal file
@ -0,0 +1,47 @@
|
||||
# Zero-shot direct-answer evaluation config for GPQA (4-option multiple
# choice). Builds one dataset entry per subset (extended/main/diamond).
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess

gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

gpqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # Adjacent string literals are concatenated into one prompt.
                dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
                     '(A){A}\n'
                     '(B){B}\n'
                     '(C){C}\n'
                     '(D){D}\n'
                     'Please provide your answer directly without any additional reasoning steps or explanations. '
                     'Format your response as follows: "The correct answer is (insert answer here)"'),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# first_option_postprocess pulls the first A-D letter out of the response.
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
                     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

gpqa_datasets = []
# Subset name -> CSV file within the data folder.
gpqa_subsets = {
    'extended': 'gpqa_extended.csv',
    'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

for split in list(gpqa_subsets.keys()):
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=gpqa_subsets[split],
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )
|
37
opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen.py
Normal file
37
opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen.py
Normal file
@ -0,0 +1,37 @@
|
||||
# Zero-shot (v2) evaluation config for GSM8K: direct final answer in
# \boxed{}, scored with the MATH v2 evaluator/postprocessor.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # \\boxed{} survives template substitution: only named
                # dataset columns (e.g. {question}) are replaced.
                dict(role='HUMAN', prompt='{question}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Scored with the MATH v2 evaluator (boxed-answer extraction); the GSM8K
# dataset postprocessor still normalizes the gold answers.
gsm8k_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)

gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg,
    )
]
|
40
opencompass/configs/datasets/gsm8k/gsm8k_fewshot_gen.py
Normal file
40
opencompass/configs/datasets/gsm8k/gsm8k_fewshot_gen.py
Normal file
@ -0,0 +1,40 @@
|
||||
# Few-shot direct-answer evaluation config for GSM8K: four fixed exemplars
# with bare numeric answers, scored with the classic GSM8K evaluator.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator


gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # 4 fixed in-prompt exemplars, each answered directly.
                dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt='The answer is 4\n'),
                dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt="The answer is 201\n"),
                dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt="The answer is 140\n"),
                dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
                dict(role='BOT', prompt='The answer is 146\n'),
                # The actual query to be answered.
                dict(role='HUMAN', prompt="Question: {question}\nPlease provide your answer directly without additional reasoning steps. Format your answer as 'The answer is (insert answer here)'\nAnswer:"),
            ],
        )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                      pred_postprocessor=dict(type=gsm8k_postprocess),
                      dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg)
]
|
@ -0,0 +1,41 @@
|
||||
# Zero-shot chain-of-thought evaluation config for the Last Letter
# Concatenation dataset.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
last_letters_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

last_letters_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: {question}\nPlease reason step by step, and format your final answer as `The answer is [ANSWER]`.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),  # zero-shot: no in-context examples
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the answer extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=last_letters_pred_postprocess),
)

last_letters_datasets = [
    dict(
        abbr='last_letters',
        type=LastLettersDataset,
        path='last_letters',
        reader_cfg=last_letters_reader_cfg,
        infer_cfg=last_letters_infer_cfg,
        eval_cfg=last_letters_eval_cfg
    )
]
|
@ -0,0 +1,41 @@
|
||||
# Zero-shot direct-answer evaluation config for the Last Letter
# Concatenation dataset (no reasoning requested).
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
last_letters_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

last_letters_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Question: {question}\nPlease respond directly without any additional explanations, and format your final answer as `The answer is [ANSWER]`.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),  # zero-shot: no in-context examples
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the answer extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=last_letters_pred_postprocess),
)

last_letters_datasets = [
    dict(
        abbr='last_letters',
        type=LastLettersDataset,
        path='last_letters',
        reader_cfg=last_letters_reader_cfg,
        infer_cfg=last_letters_infer_cfg,
        eval_cfg=last_letters_eval_cfg
    )
]
|
@ -0,0 +1,49 @@
|
||||
# Few-shot chain-of-thought evaluation config for the Last Letter
# Concatenation dataset: four fixed CoT exemplars precede the query.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
last_letters_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

last_letters_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # 4 fixed in-prompt CoT exemplars.
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Elon Musk" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Elon" is "n". The last letter of "Musk" is "k". Concatenating them is "nk".\nSo the answer is nk.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Larry Page" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Larry" is "y". The last letter of "Page" is "e". Concatenating them is "ye".\nSo the answer is ye.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Sergey Brin" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Sergey" is "y". The last letter of "Brin" is "n". Concatenating them is "yn".\nSo the answer is yn.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Bill Gates" and concatenate them.\nAnswer:'),
                dict(role='BOT', prompt='The last letter of "Bill" is "l". The last letter of "Gates" is "s". Concatenating them is "ls".\nSo the answer is ls.\n'),
                # The actual query to be answered.
                dict(role='HUMAN', prompt='Question: {question}\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the answer extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=last_letters_pred_postprocess),
)

last_letters_datasets = [
    dict(
        abbr='last_letters',
        type=LastLettersDataset,
        path='last_letters',
        reader_cfg=last_letters_reader_cfg,
        infer_cfg=last_letters_infer_cfg,
        eval_cfg=last_letters_eval_cfg
    )
]
|
@ -0,0 +1,49 @@
|
||||
# Few-shot direct-answer evaluation config for the Last Letter
# Concatenation dataset: the same four exemplar questions as the CoT
# variant, but each exemplar answer is given directly.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import LastLettersDataset, last_letters_pred_postprocess

# Reader: single input column; both splits point at 'test' since the dataset
# ships no separate train split.
last_letters_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

last_letters_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # 4 fixed in-prompt exemplars with direct answers.
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Elon Musk" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is nk.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Larry Page" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is ye.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Sergey Brin" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is yn.\n'),
                dict(role='HUMAN', prompt='Question: Take the last letters of the words in "Bill Gates" and concatenate them.\nPlease answer directly without additional reasoning steps.\nAnswer:'),
                dict(role='BOT', prompt='The answer is ls.\n'),
                # The actual query to be answered.
                dict(role='HUMAN', prompt='Question: {question}\nPlease answer directly without additional reasoning steps.\nAnswer:'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Accuracy over the answer extracted by last_letters_pred_postprocess.
last_letters_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=last_letters_pred_postprocess),
)

last_letters_datasets = [
    dict(
        abbr='last_letters',
        type=LastLettersDataset,
        path='last_letters',
        reader_cfg=last_letters_reader_cfg,
        infer_cfg=last_letters_infer_cfg,
        eval_cfg=last_letters_eval_cfg
    )
]
|
35
opencompass/configs/datasets/math/math_0shot_gen.py
Normal file
35
opencompass/configs/datasets/math/math_0shot_gen.py
Normal file
@ -0,0 +1,35 @@
|
||||
# Zero-shot direct-answer evaluation config for the MATH dataset: final
# answer only, in \boxed{}, scored with the MATH v2 evaluator.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # \\boxed{} survives template substitution: only named
                # dataset columns (e.g. {problem}) are replaced.
                dict(role='HUMAN', prompt='{problem}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
|
36
opencompass/configs/datasets/math/math_fewshot_gen.py
Normal file
36
opencompass/configs/datasets/math/math_fewshot_gen.py
Normal file
@ -0,0 +1,36 @@
|
||||
# Few-shot direct-answer evaluation config for the MATH dataset: four fixed
# exemplars whose solutions are just "Final Answer: \boxed{...}", scored
# with the MATH v2 evaluator.
#
# NOTE on braces/backslashes in the prompts below: `{{`/`}}` render as
# literal braces after template formatting, and every LaTeX command uses a
# doubled backslash (`\\det`, `\\sqrt`, ...) so the non-raw string literals
# contain no invalid escape sequences (W605; a SyntaxWarning on modern
# Python and slated to become an error).
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            # 4 fixed in-prompt exemplars with direct boxed answers.
            dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{[2,5)}}\n'),
            dict(role='HUMAN', prompt='Problem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{24}}\n'),
            dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{16}}\n'),
            dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:'),
            dict(role='BOT', prompt='Final Answer: \\boxed{{-\\frac{{2}}{{3}}}}\n'),
            # The actual query to be answered.
            dict(role='HUMAN', prompt='Problem:\n{problem}\nPlease provide only the final answer, without including any intermediate reasoning steps, and put your final answer within \\boxed{}.\nSolution:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
|
64
opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_gen.py
Normal file
64
opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_gen.py
Normal file
@ -0,0 +1,64 @@
|
||||
# Zero-shot direct-answer evaluation config for MMLU-Pro (up to 16 options,
# A-P). One dataset entry is created per category.
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUProDataset
from opencompass.utils.text_postprocessors import match_answer_pattern

with read_base():
    from .mmlu_pro_categories import categories


# Shared query template; {question} and {options_str} are filled per sample.
QUERY_TEMPLATE = """
Answer the following multiple choice question. Your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Please answer directly without additional explanations.

Question:\n
{question}

Options:\n
{options_str}

""".strip()

mmlu_pro_datasets = []

for category in categories:
    mmlu_pro_reader_cfg = dict(
        input_columns=['question', 'cot_content', 'options_str'],
        output_column='answer',
        train_split='validation',
        test_split='test',
    )
    mmlu_pro_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN',
                         prompt=QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    # Extract the letter after 'ANSWER:' (case-insensitive) for accuracy.
    mmlu_pro_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(
            type=match_answer_pattern,
            answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])')
    )

    mmlu_pro_datasets.append(
        dict(
            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
            type=MMLUProDataset,
            path='opencompass/mmlu_pro',
            category=category,
            reader_cfg=mmlu_pro_reader_cfg,
            infer_cfg=mmlu_pro_infer_cfg,
            eval_cfg=mmlu_pro_eval_cfg,
        ))
|
@ -31,6 +31,7 @@ from .cmmlu import * # noqa: F401, F403
|
||||
from .cmnli import * # noqa: F401, F403
|
||||
from .cmo_fib import * # noqa: F401, F403
|
||||
from .cmrc import * # noqa: F401, F403
|
||||
from .coinflip import * # noqa: F401, F403
|
||||
from .commonsenseqa import * # noqa: F401, F403
|
||||
from .commonsenseqa_cn import * # noqa: F401, F403
|
||||
from .copa import * # noqa: F401, F403
|
||||
@ -71,6 +72,7 @@ from .jsonl import JsonlDataset # noqa: F401, F403
|
||||
from .kaoshi import KaoshiDataset, KaoshiEvaluator # noqa: F401, F403
|
||||
from .korbench import * # noqa: F401, F403
|
||||
from .lambada import * # noqa: F401, F403
|
||||
from .last_letters import * # noqa: F401, F403
|
||||
from .lawbench import * # noqa: F401, F403
|
||||
from .LCBench import * # noqa: F401, F403
|
||||
from .lcsts import * # noqa: F401, F403
|
||||
|
39
opencompass/datasets/coinflip.py
Normal file
39
opencompass/datasets/coinflip.py
Normal file
@ -0,0 +1,39 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
|
||||
|
||||
from .base import BaseDataset
|
||||
from opencompass.utils.datasets import DEFAULT_DATA_FOLDER
|
||||
from opencompass.utils.fileio import download_url
|
||||
|
||||
@LOAD_DATASET.register_module()
class CoinFlipDataset(BaseDataset):
    """Coin Flip symbolic-reasoning dataset (yes/no answers).

    Loads ``coin_flip.json`` from the local OpenCompass data folder,
    downloading it from the Chain-of-Knowledge repository on first use.
    """

    @staticmethod
    def load(path: str):
        """Return a ``DatasetDict`` with a single ``'test'`` split.

        Args:
            path: Unused; the on-disk location is fixed relative to
                ``DEFAULT_DATA_FOLDER`` / ``COMPASS_DATA_CACHE``.
        """
        cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
        local_path = './data/coin_flip/coin_flip.json'
        data_path = os.path.join(DEFAULT_DATA_FOLDER, cache_dir, local_path)

        # Fetch the raw JSON on first use; download_url writes the file
        # into the parent directory of data_path.
        if not os.path.exists(data_path):
            dataset_url = 'https://raw.githubusercontent.com/wjn1996/Chain-of-Knowledge/refs/heads/main/tasks/Coin/dataset/coin_flip.json'
            download_url(dataset_url, os.path.dirname(data_path))

        # File layout is {"examples": [...]}; the examples need no
        # per-item transformation, so use the parsed list directly
        # instead of copying it element by element.
        with open(data_path, 'r', encoding='utf-8') as f:
            examples = json.load(f)['examples']
        return DatasetDict({'test': Dataset.from_list(examples)})
||||
|
||||
|
||||
@TEXT_POSTPROCESSORS.register_module('coinflip')
def coinflip_pred_postprocess(text: str) -> str:
    """Extract the final yes/no verdict from a model completion.

    Keeps only the text after the last ``'answer is '`` marker (the prompt
    asks for "The answer is [yes|no]"), then searches for the first
    standalone yes/no token.

    Returns:
        ``'yes'`` or ``'no'``, or ``''`` when no verdict is found.
    """
    text = text.split('answer is ')[-1]
    # \b word boundaries keep 'no' inside 'unknown'/'not' and 'yes' inside
    # 'yesterday' from being mistaken for the verdict.
    match = re.search(r'\b(yes|no)\b', text.lower())
    if match:
        return match.group(1)
    return ''
37
opencompass/datasets/last_letters.py
Normal file
37
opencompass/datasets/last_letters.py
Normal file
@ -0,0 +1,37 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
|
||||
|
||||
from .base import BaseDataset
|
||||
from opencompass.utils.datasets import DEFAULT_DATA_FOLDER
|
||||
from opencompass.utils.fileio import download_url
|
||||
|
||||
@LOAD_DATASET.register_module()
class LastLettersDataset(BaseDataset):
    """Last-letter concatenation symbolic-reasoning dataset.

    Loads ``last_letters.json`` from the local OpenCompass data folder,
    downloading it from the Chain-of-Knowledge repository on first use.
    """

    @staticmethod
    def load(path: str):
        """Return a ``DatasetDict`` with a single ``'test'`` split.

        Args:
            path: Unused; the on-disk location is fixed relative to
                ``DEFAULT_DATA_FOLDER`` / ``COMPASS_DATA_CACHE``.
        """
        cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
        local_path = './data/last_letters/last_letters.json'
        data_path = os.path.join(DEFAULT_DATA_FOLDER, cache_dir, local_path)

        # Fetch the raw JSON on first use; download_url writes the file
        # into the parent directory of data_path.
        if not os.path.exists(data_path):
            dataset_url = 'https://raw.githubusercontent.com/wjn1996/Chain-of-Knowledge/refs/heads/main/tasks/Letter/dataset/last_letters.json'
            download_url(dataset_url, os.path.dirname(data_path))

        # File layout is {"examples": [...]}; the examples need no
        # per-item transformation, so use the parsed list directly
        # instead of copying it element by element.
        with open(data_path, 'r', encoding='utf-8') as f:
            examples = json.load(f)['examples']
        return DatasetDict({'test': Dataset.from_list(examples)})
||||
|
||||
@TEXT_POSTPROCESSORS.register_module('last_letters')
def last_letters_pred_postprocess(text: str) -> str:
    """Extract the concatenated-last-letters answer from a completion.

    Takes the text after the last ``'answer is '`` marker, removes quotes,
    dots and all whitespace anywhere in it, then trims any remaining
    non-letter characters from both ends.
    """
    text = text.split('answer is ')[-1]
    # Raw-string character class replaces the original non-raw alternation
    # "\"|\'|\n|\.|\s", whose invalid escapes (\. \s in a plain string)
    # raise SyntaxWarning on Python 3.12+; the match set is identical.
    text = re.sub(r"[\"'\n.\s]", '', text)
    text = re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', text)
    return text
Loading…
Reference in New Issue
Block a user