[Feature] Add circular eval (#610)

* refactor default, add circular summarizer

* add circular

* update impl

* update doc

* minor update

* no more to be added
Fengzhe Zhou 2023-11-23 16:45:47 +08:00 committed by GitHub
parent 5202456b4c
commit d949e3c003
12 changed files with 915 additions and 145 deletions

configs/eval_circular.py Normal file
@@ -0,0 +1,91 @@
from mmengine.config import read_base
from opencompass.datasets.circular import (CircularCEvalDataset, CircularMMLUDataset, CircularCMMLUDataset, CircularCSQADataset,
                                           CircularARCDataset, CircularHSWAGDataset, CircularOBQADataset, CircularRaceDataset, CircularEvaluator)
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from .datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from .datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from .datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import commonsenseqa_datasets
    from .datasets.obqa.obqa_gen_9069e4 import obqa_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets

    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b_model
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b_model
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_model
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat_model

    from .summarizers.groups.mmlu import mmlu_summary_groups
    from .summarizers.groups.cmmlu import cmmlu_summary_groups
    from .summarizers.groups.ceval import ceval_summary_groups

for ds, t in [
    (ceval_datasets, CircularCEvalDataset),
    (mmlu_datasets, CircularMMLUDataset),
    (cmmlu_datasets, CircularCMMLUDataset),
    (hellaswag_datasets, CircularHSWAGDataset),
    (ARC_e_datasets, CircularARCDataset),
    (ARC_c_datasets, CircularARCDataset),
    (commonsenseqa_datasets, CircularCSQADataset),
    (obqa_datasets, CircularOBQADataset),
    (race_datasets, CircularRaceDataset),
]:
    for d in ds:
        d['type'] = t
        d['abbr'] = d['abbr'] + '-circular-4'
        d['eval_cfg']['evaluator'] = {'type': CircularEvaluator, 'circular_pattern': 'circular'}
        d['circular_patterns'] = 'circular'

datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
models = sum([v for k, v in locals().items() if k.endswith("_model")], [])

# config summarizer
other_summary_groups = [
    {'name': 'average',
     'subsets': ['ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c', 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high']},
]
origin_summary_groups = sum([v for k, v in locals().items() if k.endswith("_summary_groups")], [])
new_summary_groups = []
for item in origin_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'average-circular-4',
        'ceval-circular-4',
        'mmlu-circular-4',
        'cmmlu-circular-4',
        'hellaswag-circular-4',
        'ARC-e-circular-4',
        'ARC-c-circular-4',
        'commonsense_qa-circular-4',
        'openbookqa_fact-circular-4',
        'race-middle-circular-4',
        'race-high-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
        'mmlu-humanities-circular-4',
        'mmlu-stem-circular-4',
        'mmlu-social-science-circular-4',
        'mmlu-other-circular-4',
        'cmmlu-humanities-circular-4',
        'cmmlu-stem-circular-4',
        'cmmlu-social-science-circular-4',
        'cmmlu-other-circular-4',
        'cmmlu-china-specific-circular-4',
    ],
    summary_groups=new_summary_groups,
)

@@ -0,0 +1,113 @@
# CircularEval
## Background
For multiple-choice questions, when an LLM picks the correct option, that does not necessarily mean it truly understood the question and reasoned its way to the answer; it may simply have guessed. To distinguish these cases, and to reduce the model's bias towards particular option positions, CircularEval can be used: each multiple-choice question is augmented by shuffling its options, and the question is counted as correct under CircularEval only if the LLM answers every augmented variant correctly.
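As a concrete illustration, the sketch below (made-up question, `circular` pattern) shows how one item becomes four variants whose gold label moves together with the option contents:
```python
# Toy illustration of circular augmentation (illustration only, not OpenCompass code)
question = 'Which planet is closest to the Sun?'
options = {'A': 'Mercury', 'B': 'Venus', 'C': 'Earth', 'D': 'Mars'}  # gold answer: A (Mercury)

# Under the 'circular' pattern the item is expanded into 4 variants; the option
# contents are rotated and the gold label follows the correct content:
#   variant 1: A=Mercury, B=Venus,   C=Earth,   D=Mars     -> answer A
#   variant 2: A=Mars,    B=Mercury, C=Venus,   D=Earth    -> answer B
#   variant 3: A=Earth,   B=Mars,    C=Mercury, D=Venus    -> answer C
#   variant 4: A=Venus,   B=Earth,   C=Mars,    D=Mercury  -> answer D
# The question counts as correct under CircularEval only if all 4 variants are answered correctly.
```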
## Adding Your Own CircularEval Dataset
Generally, to evaluate a dataset with CircularEval, both its loading and its evaluation method need to be overridden, so modifications are required in both the OpenCompass main library and the configuration files. We use C-Eval as an example below.
OpenCompass main library:
```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The overloaded dataset class
    dataset_class = CEvalDataset

    # Splits of the DatasetDict that need CircularEval. CEvalDataset loads [dev, val, test];
    # only 'val' and 'test' need CircularEval, 'dev' does not
    default_circular_splits = ['val', 'test']

    # List of keys (options) to be shuffled
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of 'answer_key' is one of ['A', 'B', 'C', 'D'] and denotes the
    # correct answer. It indicates how to update the correct answer after the options are
    # shuffled. Mutually exclusive with default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the content of 'answer_key' is not one of ['A', 'B', 'C', 'D'], a function can be used
    # to specify the correct answer after the options are shuffled. Mutually exclusive with
    # default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # 'item' is the original data item
    #     # 'circular_pattern' is a tuple giving the option order after shuffling, e.g.
    #     # ('D', 'A', 'B', 'C') means the original option A is now D, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```
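For datasets whose options are not stored under single-letter keys, `default_answer_key_switch_method` is used instead of `default_answer_key`. A sketch modeled on the `CircularARCDataset` added in this PR (option contents live under `textA`–`textD`, while the answer is stored as a letter):
```python
from opencompass.datasets.arc import ARCDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta):
    dataset_class = ARCDataset
    # ARCDataset.load returns a plain Dataset, so there are no splits to select
    default_circular_splits = None
    default_option_keys = ['textA', 'textB', 'textC', 'textD']

    def default_answer_key_switch_method(item, circular_pattern):
        # circular_pattern contains option keys such as ('textD', 'textA', 'textB', 'textC');
        # keep only the trailing letter of each key so it can be indexed with 'ABCD'
        circular_pattern = tuple(i[-1] for i in circular_pattern)
        item['answerKey'] = circular_pattern['ABCD'.index(item['answerKey'])]
        return item
```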
`CircularCEvalDataset` accepts a `circular_patterns` parameter with two possible values (see the sketch after this list):
- `circular`: a single rotation (the default). ABCD is expanded to ABCD, BCDA, CDAB and DABC, 4 variants in total.
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total.
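The two expansions correspond to the `get_circular_patterns` and `get_all_possible_patterns` helpers that this PR adds in `opencompass/datasets/circular.py`; a self-contained sketch of their behaviour:
```python
import itertools

def get_circular_patterns(option_keys):
    # rotations: ABCD -> ABCD, BCDA, CDAB, DABC
    double_option_keys = option_keys + option_keys
    return [tuple(double_option_keys[i:i + len(option_keys)])
            for i in range(len(option_keys))]

def get_all_possible_patterns(option_keys):
    # every permutation of the option keys: 4! = 24 orderings for ABCD
    return list(itertools.permutations(option_keys))

print(len(get_circular_patterns(['A', 'B', 'C', 'D'])))      # 4
print(len(get_all_possible_patterns(['A', 'B', 'C', 'D'])))  # 24
```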
Additionally, we provide a `CircularEvaluator` as a replacement for `AccEvaluator`. The evaluator also accepts `circular_pattern`, which should be consistent with the dataset's setting. It produces the following metrics (a worked toy calculation follows the list):
- `acc_{origin|circular|all_possible}`: treats every shuffled variant as an independent question and computes accuracy over variants.
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over questions ("perf" is short for "perfect").
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over questions.
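To make the relationship between these metrics concrete, here is a toy calculation (made-up correctness values) that mirrors the grouping-by-question logic of `CircularEvaluator`:
```python
# Per-question correctness of the 4 circular variants (toy data)
results = {
    'q0': [True, True, True, True],
    'q1': [True, True, False, True],
    'q2': [True, False, False, False],
}
n_variants = 4

acc_circular = sum(sum(v) for v in results.values()) / (len(results) * n_variants) * 100
perf_circular = sum(all(v) for v in results.values()) / len(results) * 100          # all 4 variants correct
more_2_circular = sum(sum(v) >= 2 for v in results.values()) / len(results) * 100   # at least 2 variants correct

print(acc_circular, perf_circular, more_2_circular)  # ~66.7, ~33.3, ~66.7
```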
OpenCompass configuration file:
```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset, CircularEvaluator

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Overload the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish it from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Overload the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# The dataset after the above operations looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```
Additionally, for better presentation of results in CircularEval, consider using the following summarizer:
```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select the specific metrics to view
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```
For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py

@@ -67,6 +67,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
.. _Tools:
.. toctree::

@@ -0,0 +1,111 @@
# CircularEval
## Background
For multiple-choice questions, when an LLM gives the correct option, it does not necessarily mean that it truly understood the question and reasoned its way to the answer; it may simply have guessed. To distinguish these two situations, and to reduce the LLM's bias towards particular option positions, we can use CircularEval: each multiple-choice question is augmented by shuffling its options, and only if the LLM answers every augmented variant correctly do we consider the question answered correctly under CircularEval.
## Adding Your Own CircularEval Dataset
In general, to evaluate a dataset with CircularEval, both its loading and its evaluation method need to be overridden, so changes are required in both the OpenCompass main library and the configuration files. We use C-Eval as an example below.
OpenCompass main library:
```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The dataset class whose load method is overridden
    dataset_class = CEvalDataset

    # If the original load method returns a DatasetDict, the splits that should be circular-evaluated.
    # CEvalDataset loads [dev, val, test]; only 'val' and 'test' need CircularEval, 'dev' does not
    default_circular_splits = ['val', 'test']

    # The list of keys (options) to be shuffled
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of answer_key is one of ['A', 'B', 'C', 'D'] and denotes the correct
    # answer. It tells how to update the correct answer after the options are shuffled.
    # Mutually exclusive with default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the content of answer_key is not one of ['A', 'B', 'C', 'D'], a function can be used to
    # specify the correct answer after the options are shuffled. Mutually exclusive with
    # default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # item is the original data item
    #     # circular_pattern is a tuple giving the option order after shuffling, e.g.
    #     # ('D', 'A', 'B', 'C') means the original option A becomes D, the original B becomes A,
    #     # and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```
`CircularCEvalDataset` accepts a `circular_patterns` parameter with two possible values:
- `circular`: a single rotation (the default). ABCD is expanded to ABCD, BCDA, CDAB and DABC, 4 variants in total
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total
In addition, we provide a `CircularEvaluator` to replace `AccEvaluator`. The evaluator also accepts `circular_pattern`, which should be consistent with the dataset's setting. It produces the following metrics:
- `acc_{origin|circular|all_possible}`: treats every shuffled variant as an independent question and computes accuracy
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over questions
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over questions
OpenCompass configuration file:
```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset, CircularEvaluator

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Override the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish it from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Override the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# After the above operations, each dataset looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```
In addition, for a better presentation of CircularEval results, consider using the following summarizer:
```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select the specific metrics to view
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```
For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py

@@ -67,6 +67,7 @@ OpenCompass 上手路线
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
.. _工具:
.. toctree::

@@ -13,6 +13,7 @@ from .cb import * # noqa: F401, F403
from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .circular import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403
from .clozeTest_maxmin import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403

@@ -14,32 +14,19 @@ class ARCDataset(BaseDataset):
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for i, line in enumerate(in_f):
sample = json.loads(line.strip())
answerKey = sample['answerKey']
sample = sample['question']
question = sample['stem']
choices = sample['choices']
if len(choices) != 4:
for line in in_f:
item = json.loads(line.strip())
question = item['question']
if len(question['choices']) != 4:
continue
textA = choices[0]['text']
textB = choices[1]['text']
textC = choices[2]['text']
textD = choices[3]['text']
labels = [c['label'] for c in question['choices']]
answerKey = 'ABCD'[labels.index(item['answerKey'])]
rows.append({
'question': question,
'question': question['stem'],
'answerKey': answerKey,
'textA': textA,
'textB': textB,
'textC': textC,
'textD': textD
'textA': question['choices'][0]['text'],
'textB': question['choices'][1]['text'],
'textC': question['choices'][2]['text'],
'textD': question['choices'][3]['text'],
})
dataset = Dataset.from_dict({
'question': [row['question'] for row in rows],
'answerKey': [row['answerKey'] for row in rows],
'textA': [row['textA'] for row in rows],
'textB': [row['textB'] for row in rows],
'textC': [row['textC'] for row in rows],
'textD': [row['textD'] for row in rows]
})
return dataset
return Dataset.from_list(rows)

@@ -0,0 +1,348 @@
import copy
import itertools
from typing import Callable, List, Optional, Union
from datasets import Dataset, DatasetDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
from .arc import ARCDataset
from .ceval import CEvalDataset
from .cmmlu import CMMLUDataset
from .commonsenseqa import commonsenseqaDataset
from .hellaswag import hellaswagDataset_V2
from .mmlu import MMLUDataset
from .obqa import OBQADataset
from .race import RaceDataset
from .xiezhi import XiezhiDataset
def get_origin_patterns(option_keys):
return [tuple(option_keys)]
def get_circular_patterns(option_keys):
double_option_keys = option_keys + option_keys
circular_patterns = [
tuple(double_option_keys[i:i + len(option_keys)])
for i in range(len(option_keys))
]
return circular_patterns
def get_all_possible_patterns(option_keys):
circular_patterns = list(itertools.permutations(option_keys))
return circular_patterns
class CircularDatasetMeta(type):
"""This Meta Class is designed to transform a class that reads datasets
into one that supports reading datasets required for CircularEval. It
overloads an existing load method for the original class.
The Meta Class should possess the following attributes:
- `dataset_class` (class): The class for reading datasets, such as
`CEvalDataset`.
- `default_circular_splits` (list, optional): The default splits of the
dataset that need to undergo CircularEval, like ['val', 'test']. If a
`Dataset` is loaded originally, this field will be ignored.
- `default_option_keys` (list): The keys for options in the dataset, such
as ['A', 'B', 'C', 'D'].
- `default_answer_key` (str, optional): The key for answers in the dataset,
like 'answer'. This is an alternative to
`default_answer_key_switch_method`.
- `default_answer_key_switch_method` (function, optional): The method to
transform the key for answers in the dataset. This is an alternative to
`default_answer_key`.
"""
@staticmethod
def make_circular_items(
origin_item,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
qid,
):
items = []
for circular_pattern in circular_patterns:
item = copy.deepcopy(origin_item)
for i in range(len(option_keys)):
item[circular_pattern[i]] = origin_item[option_keys[i]]
if answer_key_switch_method is None:
if origin_item[answer_key] in option_keys:
item[answer_key] = circular_pattern[option_keys.index(
origin_item[answer_key])]
else:
pass
else:
item = answer_key_switch_method(item, circular_pattern)
item['qid'] = qid
item['circular_pattern'] = tuple(circular_pattern)
items.append(item)
return items
@staticmethod
def make_circular_dataset(dataset, circular_patterns, option_keys,
answer_key, answer_key_switch_method):
circulated_items = []
for i, item in enumerate(dataset):
item = CircularDatasetMeta.make_circular_items(
item,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
i,
)
circulated_items.extend(item)
return Dataset.from_list(circulated_items)
def make_circular(
dataset: Union[Dataset, DatasetDict],
circular_splits: Optional[List[str]] = ['test'],
circular_patterns: str = 'circular',
option_keys: List[str] = ['A', 'B', 'C', 'D'],
answer_key: Optional[str] = 'answer',
answer_key_switch_method: Optional[Callable] = None,
):
"""Transform the dataset into one that is compatible with CircularEval.
In CircularEval, the original multiple-choice questions with options
ABCD are augmented by shuffling the order of options, such as BCDA,
CDAB, DABC, etc. A model is considered correct only if it answers all
augmented questions correctly. This method effectively prevents models
from memorizing answers.
Args:
dataset: The dataset to be augmented.
circular_splits: List of splits to make circular. This is only
effective when the dataset is a DatasetDict.
circular_patterns: Method for circular processing, can be 'circular'
for single cycle or 'all_possible' for all permutations, default
is 'circular'.
option_keys: List of keys for options, default to ['A', 'B', 'C', 'D'].
answer_key: Key for the answer, default to 'answer'. When specified,
ensure that the content of answer_key is among the option_keys.
It is an alternative to specifying answer_key_switch_method.
answer_key_switch_method: Function to modify the answer_key. It is an
alternative to specifying answer_key.
"""
if isinstance(circular_patterns, str):
if circular_patterns == 'circular':
circular_patterns = get_circular_patterns(option_keys)
elif circular_patterns == 'all_possible':
circular_patterns = get_all_possible_patterns(option_keys)
else:
raise ValueError(
f'Unknown circular_patterns: {circular_patterns}')
else:
assert isinstance(circular_patterns, list)
assert all([isinstance(i, list) for i in circular_patterns])
# TODO: other necessary sanity checks
raise NotImplementedError(
'circular_patterns in list of list has not been tested yet')
if answer_key is None and answer_key_switch_method is None:
raise ValueError(
'answer_key and answer_key_switch_method cannot be both None')
if answer_key is not None and answer_key_switch_method is not None:
raise ValueError(
'either answer_key or answer_key_switch_method should be None')
if isinstance(dataset, Dataset):
dataset = CircularDatasetMeta.make_circular_dataset(
dataset,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
)
else:
assert isinstance(dataset, DatasetDict)
dataset_dict = {}
for split in dataset:
if circular_splits is not None and split in circular_splits:
dataset_dict[
split] = CircularDatasetMeta.make_circular_dataset(
dataset[split],
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
)
else:
dataset_dict[split] = dataset[split]
dataset = DatasetDict(dataset_dict)
return dataset
def __new__(cls, name, bases, dct):
new_cls = super().__new__(cls, name, bases, dct)
def load(cls, circular_patterns='circular', *args, **kwargs):
circular_splits = getattr(cls, 'default_circular_splits', None)
option_keys = cls.default_option_keys
answer_key = getattr(cls, 'default_answer_key', None)
answer_key_switch_method = getattr(
cls, 'default_answer_key_switch_method', None)
dataset = cls.dataset_class.load(*args, **kwargs)
return CircularDatasetMeta.make_circular(
dataset,
circular_splits,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
)
setattr(new_cls, 'load', classmethod(load))
return new_cls
class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
dataset_class = CEvalDataset
default_circular_splits = ['val', 'test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularMMLUDataset(MMLUDataset, metaclass=CircularDatasetMeta):
dataset_class = MMLUDataset
default_circular_splits = ['test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'target'
class CircularCMMLUDataset(CMMLUDataset, metaclass=CircularDatasetMeta):
dataset_class = CMMLUDataset
default_circular_splits = ['test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularCSQADataset(commonsenseqaDataset, metaclass=CircularDatasetMeta):
dataset_class = commonsenseqaDataset
default_circular_splits = ['validation']
default_option_keys = ['A', 'B', 'C', 'D', 'E']
default_answer_key = 'answerKey'
class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta):
dataset_class = ARCDataset
default_circular_splits = None
default_option_keys = ['textA', 'textB', 'textC', 'textD']
def default_answer_key_switch_method(item, circular_pattern):
circular_pattern = tuple(i[-1] for i in circular_pattern)
item['answerKey'] = circular_pattern['ABCD'.index(item['answerKey'])]
return item
class CircularHSWAGDataset(hellaswagDataset_V2, metaclass=CircularDatasetMeta):
dataset_class = hellaswagDataset_V2
default_circular_splits = None
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'label'
class CircularOBQADataset(OBQADataset, metaclass=CircularDatasetMeta):
dataset_class = OBQADataset
default_circular_splits = None
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answerKey'
class CircularRaceDataset(RaceDataset, metaclass=CircularDatasetMeta):
dataset_class = RaceDataset
default_circular_splits = ['test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta):
dataset_class = XiezhiDataset
default_circular_splits = None
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularEvaluator(BaseEvaluator):
"""This Evaluator assesses datasets post-Circular processing, generating
the following evaluation metrics:
- `acc_{origin|circular|all_possible}`: treats every variant with shuffled
answer options as an independent question and computes accuracy.
- `perf_{origin|circular|all_possible}`: according to the circular logic, a
question is considered correct only if all of its shuffled variants are
answered correctly; accuracy is computed over questions. perf is short for
perfect.
- `more_{num}_{origin|circular|all_possible}`: according to the circular logic,
a question is considered correct if the number of its correctly answered
variants is greater than or equal to `num`; accuracy is computed over
questions.
Note that when the `all_possible` method is used to shuffle option order,
it naturally includes the Circular method, and its metrics will also be
output.
Args:
circular_pattern: The method of shuffling options, either 'circular' or
'all_possible', defaulting to 'circular'.
"""
def __init__(self, circular_pattern='circular'):
super().__init__()
self.circular_pattern = circular_pattern
def score(self, predictions, references, test_set):
circular_patterns = {}
circular_patterns['origin'] = get_origin_patterns(
test_set[0]['circular_pattern'])
circular_patterns['circular'] = get_circular_patterns(
test_set[0]['circular_pattern'])
if self.circular_pattern == 'all_possible':
circular_patterns['all_possible'] = get_all_possible_patterns(
test_set[0]['circular_pattern'])
metrics = {}
tmp_metrics = {}
tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
# calculate the original accuracy
for pred, ref, origin_item in zip(predictions, references, test_set):
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
tmp_metrics[f'correct_{k}'] += 1 if pred == ref else 0
tmp_metrics[f'count_{k}'] += 1
for k in circular_patterns:
metrics[f'acc_{k}'] = (tmp_metrics[f'correct_{k}'] /
tmp_metrics[f'count_{k}'] * 100)
# calculate the circular accuracy
_details = {k: {} for k in circular_patterns}
for pred, ref, origin_item in zip(predictions, references, test_set):
index = origin_item['qid']
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
_details[k].setdefault(
index, []).append(True if pred == ref else False)
for k in _details:
_details[k] = {
index: sum(_details[k][index])
for index in _details[k]
}
for k in _details:
for j in range(1, len(circular_patterns[k]) + 1):
count = sum([_details[k][index] >= j for index in _details[k]])
total = len(_details[k])
if j != len(circular_patterns[k]):
metrics[f'more_{j}_{k}'] = count / total * 100
else:
metrics[f'perf_{k}'] = count / total * 100
return metrics

@@ -1,4 +1,5 @@
from .circular import CircularSummarizer
from .default import DefaultSummarizer
from .subjective import SubjectiveSummarizer
__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer']
__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer', 'CircularSummarizer']

@@ -0,0 +1,57 @@
from typing import List, Optional
from mmengine import ConfigDict
from opencompass.utils import dataset_abbr_from_cfg
from opencompass.utils.prompt import get_prompt_hash
from .default import DefaultSummarizer
class CircularSummarizer(DefaultSummarizer):
def __init__(self,
config: ConfigDict,
dataset_abbrs: Optional[List[str]] = None,
summary_groups: List = [],
prompt_db=None,
metric_types=None) -> None:
super().__init__(config, dataset_abbrs, summary_groups, prompt_db)
self.metric_types = metric_types
def _format_table(self, parsed_results, dataset_metrics,
dataset_eval_mode):
prompt_version = {
dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6]
for d in self.dataset_cfgs
}
table = []
header1 = ['dataset', 'version', 'mode'] + sum(
[[model_abbr] + ['-' for _ in range(len(self.metric_types) - 1)]
for model_abbr in self.model_abbrs], [])
table.append(header1)
header2 = ['-', '-', '-'] + sum(
[self.metric_types for _ in self.model_abbrs], [])
table.append(header2)
for dataset_abbr in self.dataset_abbrs:
if dataset_abbr not in dataset_metrics:
table.append([dataset_abbr, '-', '-'] + ['-'] *
len(self.model_abbrs) * len(self.metric_types))
continue
row = [
dataset_abbr,
prompt_version.get(dataset_abbr, '-'),
dataset_eval_mode.get(dataset_abbr, '-')
]
for model_abbr in self.model_abbrs:
for metric in self.metric_types:
if dataset_abbr in parsed_results[
model_abbr] and metric in parsed_results[
model_abbr][dataset_abbr]:
row.append('{:.02f}'.format(
parsed_results[model_abbr][dataset_abbr][metric]))
else:
row.append('-')
table.append(row)
return table

@@ -1,10 +1,11 @@
# flake8: noqa
# yapf: disable
import functools
import getpass
import math
import os.path as osp
from datetime import datetime
from typing import List, Optional
from typing import Any, Dict, List, Optional
import mmengine
import tabulate
@@ -22,12 +23,9 @@ class DefaultSummarizer:
"""Default summarizer in OpenCompass.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
dataset_abbrs (list[str], optional): Dataset abbreviations to be
listed in the summary.
summary_groups (list): The dataset groups whose results need to be
averaged out. For example, mmlu. Each item it a dict with
config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime.
dataset_abbrs (list[str], optional): Dataset abbreviations to be listed in the summary.
summary_groups (list): The dataset groups whose results need to be averaged out. For example, mmlu. Each item is a dict with
'name' (str) and 'subsets' (list of dataset abbrs), and optionally
'weights' if weighted average is needed.
prompt_db: A deprecated field.
@@ -48,28 +46,37 @@ class DefaultSummarizer:
if self.cfg.get('lark_bot_url', None):
self.lark_reporter = LarkReporter(self.cfg['lark_bot_url'])
def summarize(
self,
output_path: str = None,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa
self.model_cfgs = self.cfg['models']
self.dataset_cfgs = self.cfg['datasets']
self.work_dir = self.cfg['work_dir']
self.model_abbrs = [model_abbr_from_cfg(model) for model in self.model_cfgs]
model_cfgs = self.cfg['models']
dataset_cfgs = self.cfg['datasets']
work_dir = self.cfg['work_dir']
def _pick_up_results(self):
"""The function reads the numerical results of evaluations from the
output folder based on the configuration file, and ultimately returns
four dictionaries, each containing processed information in different
formats. The contents of the four dictionaries are as follows:
# pick up results
raw_results = {}
parsed_results = {}
dataset_metrics = {}
- raw_results: contains the raw results of each model on each dataset (excluding details).
- parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored.
- dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. The list is ordered according to the METRIC_WHITELIST,
with metrics appearing earlier considered more important.
- dataset_eval_mode: contains the evaluation mode for each dataset.
"""
# raw_results: {model_abbr: {dataset_abbr: result}}
raw_results : Dict[str, Dict[str, Any]] = {}
# parsed_results: {model_abbr: {dataset_abbr: {metric: score}}}
parsed_results : Dict[str, Dict[str, Dict[str, float]]] = {}
# dataset_metrics: {dataset_abbr: [metric]}
dataset_metrics : Dict[str, List[str]] = {}
model_abbrs = [model_abbr_from_cfg(model) for model in model_cfgs]
for model in model_cfgs:
for model in self.model_cfgs:
model_abbr = model_abbr_from_cfg(model)
parsed_results[model_abbr] = {}
raw_results[model_abbr] = {}
for dataset in dataset_cfgs:
for dataset in self.dataset_cfgs:
dataset_abbr = dataset_abbr_from_cfg(dataset)
filepath = get_infer_output_path(model, dataset, osp.join(work_dir, 'results'))
filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results'))
if not osp.exists(filepath):
continue
result = mmengine.load(filepath)
@@ -78,34 +85,28 @@ class DefaultSummarizer:
if 'error' in result:
self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}')
continue
else:
parsed_results[model_abbr][dataset_abbr] = []
dataset_metrics[dataset_abbr] = []
for metric, score in result.items():
if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)):
parsed_results[model_abbr][dataset_abbr].append(score)
dataset_metrics[dataset_abbr].append(metric)
else:
continue
if len(parsed_results[model_abbr][dataset_abbr]) == 0:
self.logger.warning(f'unknown result format: {result}, continue')
del parsed_results[model_abbr][dataset_abbr]
del dataset_metrics[dataset_abbr]
_rst, _dm = {}, []
for metric, score in result.items():
if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)):
_rst[metric] = score
_dm.append(metric)
else:
continue
indice = sorted(
list(range(len(dataset_metrics[dataset_abbr]))),
key=lambda i: (
METRIC_WHITELIST.index(dataset_metrics[dataset_abbr][i])
if dataset_metrics[dataset_abbr][i] in METRIC_WHITELIST
else len(METRIC_WHITELIST)
)
)
parsed_results[model_abbr][dataset_abbr] = [parsed_results[model_abbr][dataset_abbr][i] for i in indice]
dataset_metrics[dataset_abbr] = [dataset_metrics[dataset_abbr][i] for i in indice]
if len(_rst) == 0:
self.logger.warning(f'unknown result format: {result}, continue')
continue
_dm = sorted(_dm, key=lambda i: METRIC_WHITELIST.index(i) if i in METRIC_WHITELIST else len(METRIC_WHITELIST))
# parse eval mode
dataset_eval_mode = {}
for dataset in dataset_cfgs:
if dataset_abbr in dataset_metrics:
assert tuple(dataset_metrics[dataset_abbr]) == tuple(_dm), \
f'{dataset_abbr} has different metrics: {dataset_metrics[dataset_abbr]} vs {_dm}'
else:
dataset_metrics[dataset_abbr] = _dm
parsed_results[model_abbr][dataset_abbr] = _rst
# dataset_eval_mode: {dataset_abbr: eval_mode}
dataset_eval_mode : Dict[str, str] = {}
for dataset in self.dataset_cfgs:
inferencer = dataset.get('infer_cfg', {}).get('inferencer', {}).get('type', '')
inferencer = inferencer if isinstance(inferencer, str) else inferencer.__name__
dataset_abbr = dataset_abbr_from_cfg(dataset)
@@ -116,64 +117,97 @@ class DefaultSummarizer:
else:
dataset_eval_mode[dataset_abbr] = 'unknown'
self.logger.warning(f'unknown inferencer: {inferencer} - {dataset_abbr}')
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
# calculate group metrics
def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics, dataset_eval_mode):
"""The function calculates the numerical results for each group based
on the configuration in summary_groups, and updates the contents of
each dictionary accordingly."""
summary_groups = self.summary_groups
for sg in summary_groups:
for model_abbr in model_abbrs:
results = {}
eval_modes = []
for dataset_abbr in sg['subsets']:
if dataset_abbr in parsed_results[model_abbr]:
results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0]
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
if len(results) == len(sg['subsets']):
if 'std' in sg and sg['std'] == True:
avg = sum(results[k] for k in results) / len(results)
variance = sum((results[k] - avg)**2 for k in results) / len(results)
metric = 'standard_deviation'
results[metric] = math.sqrt(variance)
else:
if 'weights' in sg:
numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
denominator = sum(sg['weights'].values())
metric = 'weighted_average'
else:
numerator = sum(results[k] for k in results)
denominator = len(results)
metric = 'naive_average'
results[metric] = numerator / denominator
for model_abbr in self.model_abbrs:
available_count = sum(dataset_abbr in parsed_results[model_abbr] for dataset_abbr in sg['subsets'])
if available_count == 0:
continue
if available_count != len(sg['subsets']):
raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(parsed_results[model_abbr].keys()))}
continue
if sg.get('std', False):
default_metric = 'standard_deviation'
elif sg.get('weights', []):
default_metric = 'weighted_average'
else:
default_metric = 'naive_average'
scores, eval_modes, group_metrics = {}, [], None
if any(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']) and \
any(isinstance(dataset_abbr, str) for dataset_abbr in sg['subsets']):
raise NotImplementedError('mixed dataset_abbr type is not supported')
if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']):
group_metrics = [default_metric]
for dataset_abbr, metric in sg['subsets']:
scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
if len(group_metrics) > 1:
for metric in group_metrics:
for dataset_abbr in sg['subsets']:
scores.setdefault(metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
else:
group_metrics = [default_metric]
for dataset_abbr in sg['subsets']:
metric = dataset_metrics[dataset_abbr][0]
scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
result = {}
for metric in scores:
if default_metric == 'standard_deviation':
avg = sum(scores[metric]) / len(scores[metric])
variance = sum((k - avg) ** 2 for k in scores[metric]) / len(scores[metric])
scores[metric] = result[metric] = math.sqrt(variance)
else:
if default_metric == 'weighted_average':
numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'])
denominator = sum(sg['weights'].values())
else:
numerator = sum(scores[metric])
denominator = len(scores[metric])
scores[metric] = result[metric] = numerator / denominator
eval_modes = list(set(eval_modes))
eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'
# add to global results
raw_results[model_abbr][sg['name']] = results
parsed_results[model_abbr][sg['name']] = [results[metric]]
dataset_metrics[sg['name']] = [metric]
dataset_eval_mode[sg['name']] = eval_mode
elif len(results) == 0:
continue
else:
raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(results.keys()))}
# add to global results
raw_results[model_abbr][sg['name']] = scores
parsed_results[model_abbr][sg['name']]= result
dataset_metrics[sg['name']] = group_metrics
dataset_eval_mode[sg['name']] = eval_mode
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in dataset_cfgs}
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode):
dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs]
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs}
# format table
summarizer_dataset_abbrs = []
if self.dataset_abbrs is None:
for dataset in dataset_cfgs:
dataset_abbr = dataset_abbr_from_cfg(dataset)
# display all dataset metrics included in the config
for dataset_abbr in dataset_abbrs:
if dataset_abbr in dataset_metrics:
for metric in dataset_metrics[dataset_abbr]:
summarizer_dataset_abbrs.append((dataset_abbr, metric))
else:
summarizer_dataset_abbrs.append((dataset_abbr, None))
# along with all possible group metrics
for dataset_abbr in dataset_metrics:
for metric in dataset_metrics[dataset_abbr]:
if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
summarizer_dataset_abbrs.append((dataset_abbr, metric))
else:
# follow the required order
for item in self.dataset_abbrs:
if isinstance(item, str):
summarizer_dataset_abbrs.append((item, None))
@@ -181,79 +215,103 @@ class DefaultSummarizer:
summarizer_dataset_abbrs.append((item[0], item[1]))
table = []
header = ['dataset', 'version', 'metric', 'mode'] + model_abbrs
header = ['dataset', 'version', 'metric', 'mode'] + self.model_abbrs
table.append(header)
for dataset_abbr, metric in summarizer_dataset_abbrs:
if dataset_abbr not in dataset_metrics:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
if metric is None:
index = 0
metric = dataset_metrics[dataset_abbr][0]
elif metric in dataset_metrics[dataset_abbr]:
index = dataset_metrics[dataset_abbr].index(metric)
pass
else:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]
for model_abbr in model_abbrs:
for model_abbr in self.model_abbrs:
if dataset_abbr in parsed_results[model_abbr]:
row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][index]))
row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][metric]))
else:
row.append('-')
table.append(row)
return table
# format raw txt
def _format_raw_txt(self, raw_results):
raw_dataset_abbrs = []
for model_abbr in model_abbrs:
for model_abbr in self.model_abbrs:
for dataset_abbr in raw_results[model_abbr]:
if dataset_abbr not in raw_dataset_abbrs:
raw_dataset_abbrs.append(dataset_abbr)
raw_txts = []
for model_abbr in model_abbrs:
for model_abbr in self.model_abbrs:
raw_txts.append('-------------------------------')
raw_txts.append(f'Model: {model_abbr}')
for dataset_abbr in raw_dataset_abbrs:
result = raw_results[model_abbr].get(dataset_abbr, '{}')
raw_txts.append(f'{dataset_abbr}: {result}')
raw_txts = '\n'.join(raw_txts)
return raw_txts
# output to screean
print(tabulate.tabulate(table, headers='firstrow'))
def _output_to_file(self, output_path, time_str, table, raw_txts):
# output to file
if output_path is None:
output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
output_csv_path = osp.join(work_dir, 'summary', f'summary_{time_str}.csv')
output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt')
output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv')
else:
output_csv_path = output_path.replace('.txt', '.csv')
output_dir = osp.split(output_path)[0]
mmengine.mkdir_or_exist(output_dir)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(time_str + '\n')
f.write('tabulate format\n')
f.write('^' * 128 + '\n')
f.write(tabulate.tabulate(table, headers='firstrow') + '\n')
f.write('$' * 128 + '\n')
f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n')
f.write('csv format\n')
f.write('^' * 128 + '\n')
f.write('\n'.join([','.join(row) for row in table]) + '\n')
f.write('$' * 128 + '\n')
f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n')
f.write('raw format\n')
f.write('^' * 128 + '\n')
f.write(raw_txts + '\n')
f.write('$' * 128 + '\n')
text = f'{time_str}\n' + \
'tabulate format\n' + \
'^' * 128 + '\n' + \
tabulate.tabulate(table, headers='firstrow') + '\n' + \
'$' * 128 + '\n\n' + \
'-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \
'csv format\n' + \
'^' * 128 + '\n' + \
'\n'.join([','.join(row) for row in table]) + '\n' + \
'$' * 128 + '\n\n' + \
'-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \
'raw format\n' + \
'^' * 128 + '\n' + \
raw_txts + '\n' + \
'$' * 128 + '\n'
f.write(text)
self.logger.info(f'write summary to {osp.abspath(output_path)}')
with open(output_csv_path, 'w', encoding='utf-8') as f:
f.write('\n'.join([','.join(row) for row in table]) + '\n')
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
def summarize(
self,
output_path: str = None,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa
# pick up results
raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results()
# calculate group metrics
raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode)
# format table
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode)
# format raw txt
raw_txts = self._format_raw_txt(raw_results)
# output to screen
print(tabulate.tabulate(table, headers='firstrow'))
# output to .text / .csv files
self._output_to_file(output_path, time_str, table, raw_txts)
if self.lark_reporter:
content = f'{getpass.getuser()}'
content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'
self.lark_reporter.post(content)
with open(output_csv_path, 'w', encoding='utf-8') as f:
f.write('\n'.join([','.join(row) for row in table]) + '\n')
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')

@@ -172,6 +172,7 @@ class OpenICLEvalTask(BaseTask):
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
preds = {
k: preds[k]
for k in signature(icl_evaluator.score).parameters