diff --git a/configs/eval_circular.py b/configs/eval_circular.py new file mode 100644 index 00000000..e1aab0f8 --- /dev/null +++ b/configs/eval_circular.py @@ -0,0 +1,91 @@ +from mmengine.config import read_base +from opencompass.datasets.circular import (CircularCEvalDataset, CircularMMLUDataset, CircularCMMLUDataset, CircularCSQADataset, + CircularARCDataset, CircularHSWAGDataset, CircularOBQADataset, CircularRaceDataset, CircularEvaluator) +from opencompass.summarizers import CircularSummarizer + +with read_base(): + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets + from .datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets + from .datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets + from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets + from .datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import commonsenseqa_datasets + from .datasets.obqa.obqa_gen_9069e4 import obqa_datasets + from .datasets.race.race_gen_69ee4f import race_datasets + + from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b_model + from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b_model + from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_model + from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat_model + + from .summarizers.groups.mmlu import mmlu_summary_groups + from .summarizers.groups.cmmlu import cmmlu_summary_groups + from .summarizers.groups.ceval import ceval_summary_groups + +for ds, t in [ + (ceval_datasets, CircularCEvalDataset), + (mmlu_datasets, CircularMMLUDataset), + (cmmlu_datasets, CircularCMMLUDataset), + (hellaswag_datasets, CircularHSWAGDataset), + (ARC_e_datasets, CircularARCDataset), + (ARC_c_datasets, CircularARCDataset), + (commonsenseqa_datasets, CircularCSQADataset), + (obqa_datasets, CircularOBQADataset), + (race_datasets, CircularRaceDataset), +]: + for d in ds: + d['type'] = t + d['abbr'] = d['abbr'] + '-circular-4' + d['eval_cfg']['evaluator'] = {'type': CircularEvaluator, 'circular_pattern': 'circular'} + d['circular_patterns'] = 'circular' + + +datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], []) +models = sum([v for k, v in locals().items() if k.endswith("_model")], []) + +# config summarizer +other_summary_groups = [ + {'name': 'average', + 'subsets': ['ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c', 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high']}, +] +origin_summary_groups = sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []) +new_summary_groups = [] +for item in origin_summary_groups: + new_summary_groups.append( + { + 'name': item['name'] + '-circular-4', + 'subsets': [i + '-circular-4' for i in item['subsets']], + } + ) +summarizer = dict( + type=CircularSummarizer, + metric_types=['acc_origin', 'perf_circular'], + dataset_abbrs = [ + 'average-circular-4', + 'ceval-circular-4', + 'mmlu-circular-4', + 'cmmlu-circular-4', + 'hellaswag-circular-4', + 'ARC-e-circular-4', + 'ARC-c-circular-4', + 'commonsense_qa-circular-4', + 'openbookqa_fact-circular-4', + 'race-middle-circular-4', + 'race-high-circular-4', + 'ceval-humanities-circular-4', + 'ceval-stem-circular-4', + 'ceval-social-science-circular-4', + 'ceval-other-circular-4', + 'mmlu-humanities-circular-4', + 'mmlu-stem-circular-4', + 'mmlu-social-science-circular-4', + 
        'mmlu-other-circular-4',
+        'cmmlu-humanities-circular-4',
+        'cmmlu-stem-circular-4',
+        'cmmlu-social-science-circular-4',
+        'cmmlu-other-circular-4',
+        'cmmlu-china-specific-circular-4',
+    ],
+    summary_groups=new_summary_groups,
+)
diff --git a/docs/en/advanced_guides/circular_eval.md b/docs/en/advanced_guides/circular_eval.md
new file mode 100644
index 00000000..ef7e3b5c
--- /dev/null
+++ b/docs/en/advanced_guides/circular_eval.md
@@ -0,0 +1,113 @@
+# CircularEval
+
+## Background
+
+For multiple-choice questions, a large language model (LLM) that picks the correct option has not necessarily understood the question and reasoned its way to the answer; it may simply have guessed. To distinguish these two cases, and to reduce the LLM's bias towards particular options, CircularEval can be used. Each multiple-choice question is augmented by shuffling its options, and a question only counts as correct under CircularEval if the LLM answers every augmented variant correctly.
+
+## Adding Your Own CircularEval Dataset
+
+In general, evaluating a dataset with CircularEval requires rewriting both its loading and its evaluation logic, so changes are needed in the OpenCompass main library as well as in the configuration files. We use C-Eval as an example below.
+
+OpenCompass main library:
+
+```python
+from opencompass.datasets.ceval import CEvalDataset
+from opencompass.datasets.circular import CircularDatasetMeta
+
+class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
+    # The overloaded dataset class
+    dataset_class = CEvalDataset
+
+    # Splits of the DatasetDict that need CircularEval. CEvalDataset loads [dev, val, test]; only 'val' and 'test' need CircularEval, 'dev' does not
+    default_circular_splits = ['val', 'test']
+
+    # List of keys to be shuffled
+    default_option_keys = ['A', 'B', 'C', 'D']
+
+    # Used when the content of 'answer_key' is one of ['A', 'B', 'C', 'D'] and represents the correct answer; it tells the meta class how to update the answer after the options are shuffled. Choose either this or default_answer_key_switch_method
+    default_answer_key = 'answer'
+
+    # If the content of 'answer_key' is not one of ['A', 'B', 'C', 'D'], a function can be used instead to set the correct answer after the options are shuffled. Choose either this or default_answer_key
+    # def default_answer_key_switch_method(item, circular_pattern):
+    #     # 'item' is the original data item
+    #     # 'circular_pattern' is a tuple giving the option order after shuffling, e.g., ('D', 'A', 'B', 'C') means the original option A is now D, and so on
+    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
+    #     return item
+```
+
+`CircularCEvalDataset` accepts the `circular_pattern` parameter with two values:
+
+- `circular`: a single cycle, the default. ABCD is expanded to ABCD, BCDA, CDAB, DABC, 4 variants in total.
+- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total.
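+
+To make the two expansions concrete, the following minimal, standalone sketch reproduces the option orders they generate (it simply mirrors the `get_circular_patterns` / `get_all_possible_patterns` helpers added in `opencompass/datasets/circular.py`):
+
+```python
+import itertools
+
+option_keys = ['A', 'B', 'C', 'D']
+
+# 'circular': rotate the option order one step at a time -> 4 variants
+circular = [tuple((option_keys * 2)[i:i + len(option_keys)])
+            for i in range(len(option_keys))]
+print(circular)
+# [('A', 'B', 'C', 'D'), ('B', 'C', 'D', 'A'), ('C', 'D', 'A', 'B'), ('D', 'A', 'B', 'C')]
+
+# 'all_possible': every permutation of the options -> 4! = 24 variants
+all_possible = list(itertools.permutations(option_keys))
+print(len(all_possible))  # 24
+```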
+
+Additionally, we provide a `CircularEvaluator` to replace `AccEvaluator`. This evaluator also accepts `circular_pattern`, which should be consistent with the value used above. It produces the following metrics:
+
+- `acc_{origin|circular|all_possible}`: treats every question with shuffled options as a separate question and computes accuracy over all of them.
+- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if every one of its shuffled variants is answered correctly; accuracy is computed over questions.
+- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over questions.
+
+OpenCompass configuration file:
+
+```python
+from mmengine.config import read_base
+from opencompass.datasets.circular import CircularCEvalDataset, CircularEvaluator
+
+with read_base():
+    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
+
+for d in ceval_datasets:
+    # Overload the load method
+    d['type'] = CircularCEvalDataset
+    # Rename to distinguish from the non-circular version
+    d['abbr'] = d['abbr'] + '-circular-4'
+    # Overload the evaluation method
+    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}
+
+# After the above operations, each dataset looks like this:
+# dict(
+#     type=CircularCEvalDataset,
+#     path='./data/ceval/formal_ceval',  # Unchanged
+#     name='computer_network',  # Unchanged
+#     abbr='ceval-computer_network-circular-4',
+#     reader_cfg=dict(...),  # Unchanged
+#     infer_cfg=dict(...),  # Unchanged
+#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
+# )
+```
+
+Additionally, for a better presentation of CircularEval results, consider using the following summarizer:
+
+```python
+from mmengine.config import read_base
+from opencompass.summarizers import CircularSummarizer
+
+with read_base():
+    from ...summarizers.groups.ceval import ceval_summary_groups
+
+new_summary_groups = []
+for item in ceval_summary_groups:
+    new_summary_groups.append(
+        {
+            'name': item['name'] + '-circular-4',
+            'subsets': [i + '-circular-4' for i in item['subsets']],
+        }
+    )
+
+summarizer = dict(
+    type=CircularSummarizer,
+    # Select the metrics to display
+    metric_types=['acc_origin', 'perf_circular'],
+    dataset_abbrs = [
+        'ceval-circular-4',
+        'ceval-humanities-circular-4',
+        'ceval-stem-circular-4',
+        'ceval-social-science-circular-4',
+        'ceval-other-circular-4',
+    ],
+    summary_groups=new_summary_groups,
+)
+```
+
+For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
diff --git a/docs/en/index.rst b/docs/en/index.rst
index c45120d7..c6c61ab4 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -67,6 +67,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
    advanced_guides/prompt_attack.md
    advanced_guides/longeval.md
    advanced_guides/subjective_evaluation.md
+   advanced_guides/circular_eval.md

 .. _Tools:
 ..
toctree:: diff --git a/docs/zh_cn/advanced_guides/circular_eval.md b/docs/zh_cn/advanced_guides/circular_eval.md new file mode 100644 index 00000000..402d3f5e --- /dev/null +++ b/docs/zh_cn/advanced_guides/circular_eval.md @@ -0,0 +1,111 @@ +# 循环评测 + +## 背景 + +对于选择题而言,当 LLM 给出正确的选项,并不一定代表着它能真正地理解题意并经过推理得出答案,它也有可能是蒙对的。为了将这两种情形区分开,同时也为了降低 LLM 对选项的偏见,我们可以尝试使用循环评测 (CircularEval)。我们会将一道选择题按照打乱选项的方式进行增广,若 LLM 可以在增广后的每道题上均得到正确的答案,那么我们认为在循环评测的意义下,这道题被做对了。 + +## 新增自己的循环评测数据集 + +一般来说,为了将一个数据集使用循环评测的方式进行评测,它的加载方式和评测方式是需要被重写的,OpenCompass 主库和配置文件均需要进行修改。后续我们以 C-Eval 为例进行讲解。 + +OpenCompass 主库: + +```python +from opencompass.datasets.ceval import CEvalDataset +from opencompass.datasets.circular import CircularDatasetMeta + +class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta): + # 被重载的数据集类 + dataset_class = CEvalDataset + + # 若原 load 方法得到一 DatasetDict,其哪些 split 需要被循环评测。CEvalDataset load 得到 [dev, val, test],我们只需要对 val 和 test 进行循环评测,dev 不需要 + default_circular_splits = ['val', 'test'] + + # 需要被打乱的 key 列表 + default_option_keys = ['A', 'B', 'C', 'D'] + + # 若 answer_key 的内容属于是 ['A', 'B', 'C', 'D'] 之一,并表示正确答案。该字段表示打乱选项后,需要如何更新正确答案。与 default_answer_key_switch_method 二选一 + default_answer_key = 'answer' + + # 如果 answer_key 的内容不属于 ['A', 'B', 'C', 'D'] 之一,那么可以使用函数的方式来指定打乱选项后的正确答案。与 default_answer_key 二选一 + # def default_answer_key_switch_method(item, circular_pattern): + # # item 是原本的数据项 + # # circular_pattern 是一个 tuple,表示打乱选项后的顺序,例如 ('D', 'A', 'B', 'C') 表示原来的 A 选项变成了 D,原来的 B 选项变成了 A,以此类推 + # item['answer'] = circular_pattern['ABCD'.index(item['answer'])] + # return item +``` + +`CircularCEvalDataset` 会接受 `circular_pattern` 参数,它有两个取值: + +- `circular`: 表示单项循环。默认为该值。ABCD 会被扩充为 ABCD, BCDA, CDAB, DABC, 共 4 种 +- `all_possible`: 表示全排列。ABCD 会被扩充为 ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 共 24 种 + +另外我们提供了一个 `CircularEvaluator` 用于替换 `AccEvaluator`,该 Evaluator 同样接受 `circular_pattern`,该参数应与上述保持一致。它会产出以下指标: + +- `acc_{origin|circular|all_possible}`: 将打乱后选项顺序后的题目视作多道单独的题目,计算准确率 +- `perf_{origin|circular|all_possible}`: 按照 circular 的逻辑,若选项打乱后的题目都回答正确,才会视为这道题正确,计算准确率 +- `more_{num}_{origin|circular|all_possible}`: 按照 circular 的逻辑,若选项打乱后的题目回答正确的数量大于等于 num,就会视为这道题正确,计算准确率 + +OpenCompass 配置文件: + +```python +from mmengine.config import read_base +from opencompass.datasets.circular import CircularCEvalDataset + +with read_base(): + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + +for d in ceval_datasets: + # 重载 load 方法 + d['type'] = CircularCEvalDataset + # 为了与非循环评测版本做区分而进行改名 + d['abbr'] = d['abbr'] + '-circular-4' + # 重载评测方法 + d['eval_cfg']['evaluator'] = {'type': CircularEvaluator} + +# 上述操作后的 dataset 形如下: +# dict( +# type=CircularCEvalDataset, +# path='./data/ceval/formal_ceval', # 未改变 +# name='computer_network', # 未改变 +# abbr='ceval-computer_network-circular-4', +# reader_cfg=dict(...), # 未改变 +# infer_cfg=dict(...), # 未改变 +# eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...), +# ) +``` + +另外评测时为了针对循环评测有更良好的结果呈现,建议考虑使用以下 summarizer + +```python +from mmengine.config import read_base +from opencompass.summarizers import CircularSummarizer + +with read_base(): + from ...summarizers.groups.ceval import ceval_summary_groups + +new_summary_groups = [] +for item in ceval_summary_groups: + new_summary_groups.append( + { + 'name': item['name'] + '-circular-4', + 'subsets': [i + '-circular-4' for i in item['subsets']], + } + ) + +summarizer = dict( + type=CircularSummarizer, + # 选择具体看哪些指标 + metric_types=['acc_origin', 'perf_circular'], + dataset_abbrs = [ + 'ceval-circular-4', + 
'ceval-humanities-circular-4', + 'ceval-stem-circular-4', + 'ceval-social-science-circular-4', + 'ceval-other-circular-4', + ], + summary_groups=new_summary_groups, +) +``` + +更多复杂的评测案例可以参考这个样例代码: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 74343daa..7cab73c8 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -67,6 +67,7 @@ OpenCompass 上手路线 advanced_guides/prompt_attack.md advanced_guides/longeval.md advanced_guides/subjective_evaluation.md + advanced_guides/circular_eval.md .. _工具: .. toctree:: diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index f63eac86..306cee4a 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -13,6 +13,7 @@ from .cb import * # noqa: F401, F403 from .ceval import * # noqa: F401, F403 from .chid import * # noqa: F401, F403 from .cibench import * # noqa: F401, F403 +from .circular import * # noqa: F401, F403 from .civilcomments import * # noqa: F401, F403 from .clozeTest_maxmin import * # noqa: F401, F403 from .cluewsc import * # noqa: F401, F403 diff --git a/opencompass/datasets/arc.py b/opencompass/datasets/arc.py index 62e8cbea..1cf50f11 100644 --- a/opencompass/datasets/arc.py +++ b/opencompass/datasets/arc.py @@ -14,32 +14,19 @@ class ARCDataset(BaseDataset): def load(path: str): with open(path, 'r', errors='ignore') as in_f: rows = [] - for i, line in enumerate(in_f): - sample = json.loads(line.strip()) - answerKey = sample['answerKey'] - sample = sample['question'] - question = sample['stem'] - choices = sample['choices'] - if len(choices) != 4: + for line in in_f: + item = json.loads(line.strip()) + question = item['question'] + if len(question['choices']) != 4: continue - textA = choices[0]['text'] - textB = choices[1]['text'] - textC = choices[2]['text'] - textD = choices[3]['text'] + labels = [c['label'] for c in question['choices']] + answerKey = 'ABCD'[labels.index(item['answerKey'])] rows.append({ - 'question': question, + 'question': question['stem'], 'answerKey': answerKey, - 'textA': textA, - 'textB': textB, - 'textC': textC, - 'textD': textD + 'textA': question['choices'][0]['text'], + 'textB': question['choices'][1]['text'], + 'textC': question['choices'][2]['text'], + 'textD': question['choices'][3]['text'], }) - dataset = Dataset.from_dict({ - 'question': [row['question'] for row in rows], - 'answerKey': [row['answerKey'] for row in rows], - 'textA': [row['textA'] for row in rows], - 'textB': [row['textB'] for row in rows], - 'textC': [row['textC'] for row in rows], - 'textD': [row['textD'] for row in rows] - }) - return dataset + return Dataset.from_list(rows) diff --git a/opencompass/datasets/circular.py b/opencompass/datasets/circular.py new file mode 100644 index 00000000..552d26e4 --- /dev/null +++ b/opencompass/datasets/circular.py @@ -0,0 +1,348 @@ +import copy +import itertools +from typing import Callable, List, Optional, Union + +from datasets import Dataset, DatasetDict + +from opencompass.openicl.icl_evaluator import BaseEvaluator + +from .arc import ARCDataset +from .ceval import CEvalDataset +from .cmmlu import CMMLUDataset +from .commonsenseqa import commonsenseqaDataset +from .hellaswag import hellaswagDataset_V2 +from .mmlu import MMLUDataset +from .obqa import OBQADataset +from .race import RaceDataset +from .xiezhi import XiezhiDataset + + +def get_origin_patterns(option_keys): + return [tuple(option_keys)] + + +def get_circular_patterns(option_keys): 
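+    # Rotate the option keys one position at a time,
+    # e.g. ['A', 'B', 'C', 'D'] -> [('A', 'B', 'C', 'D'), ('B', 'C', 'D', 'A'),
+    # ('C', 'D', 'A', 'B'), ('D', 'A', 'B', 'C')]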
+ double_option_keys = option_keys + option_keys + circular_patterns = [ + tuple(double_option_keys[i:i + len(option_keys)]) + for i in range(len(option_keys)) + ] + return circular_patterns + + +def get_all_possible_patterns(option_keys): + circular_patterns = list(itertools.permutations(option_keys)) + return circular_patterns + + +class CircularDatasetMeta(type): + """This Meta Class is designed to transform a class that reads datasets + into one that supports reading datasets required for CircularEval. It + overloads an existing load method for the original class. + + The Meta Class should possess the following attributes: + + - `dataset_class` (class): The class for reading datasets, such as + `CEvalDataset`. + - `default_circular_splits` (list, optional): The default splits of the + dataset that need to undergo CircularEval, like ['val', 'test']. If a + `Dataset` is loaded originally, this field will be ignored. + - `default_option_keys` (list): The keys for options in the dataset, such + as ['A', 'B', 'C', 'D']. + - `default_answer_key` (str, optional): The key for answers in the dataset, + like 'answer'. This is an alternative to + `default_answer_key_switch_method`. + - `default_answer_key_switch_method` (function, optional): The method to + transform the key for answers in the dataset. This is an alternative to + `default_answer_key`. + """ + + @staticmethod + def make_circular_items( + origin_item, + circular_patterns, + option_keys, + answer_key, + answer_key_switch_method, + qid, + ): + items = [] + for circular_pattern in circular_patterns: + item = copy.deepcopy(origin_item) + for i in range(len(option_keys)): + item[circular_pattern[i]] = origin_item[option_keys[i]] + if answer_key_switch_method is None: + if origin_item[answer_key] in option_keys: + item[answer_key] = circular_pattern[option_keys.index( + origin_item[answer_key])] + else: + pass + else: + item = answer_key_switch_method(item, circular_pattern) + item['qid'] = qid + item['circular_pattern'] = tuple(circular_pattern) + items.append(item) + return items + + @staticmethod + def make_circular_dataset(dataset, circular_patterns, option_keys, + answer_key, answer_key_switch_method): + circulated_items = [] + for i, item in enumerate(dataset): + item = CircularDatasetMeta.make_circular_items( + item, + circular_patterns, + option_keys, + answer_key, + answer_key_switch_method, + i, + ) + circulated_items.extend(item) + return Dataset.from_list(circulated_items) + + def make_circular( + dataset: Union[Dataset, DatasetDict], + circular_splits: Optional[List[str]] = ['test'], + circular_patterns: str = 'circular', + option_keys: List[str] = ['A', 'B', 'C', 'D'], + answer_key: Optional[str] = 'answer', + answer_key_switch_method: Optional[Callable] = None, + ): + """Transform the dataset into one that is compatible with CircularEval. + In CircularEval, the original multiple-choice questions with options + ABCD are augmented by shuffling the order of options, such as BCDA, + CDAB, DABC, etc. A model is considered correct only if it answers all + augmented questions correctly. This method effectively prevents models + from memorizing answers. + + Args: + datasets: The dataset to be augmented. + circular_splits: List of splits to make circular. This is only + effective when the dataset is a DatasetDict. + circular_patterns: Method for circular processing, can be 'circular' + for single cycle or 'all_possible' for all permutations, default + is 'circular'. 
+ option_keys: List of keys for options, default to ['A', 'B', 'C', 'D']. + answer_key: Key for the answer, default to 'answer'. When specified, + ensure that the content of answer_key is among the option_keys. + It is an alternative to specifying answer_key_switch_method. + answer_key_switch_method: Function to modify the answer_key. It is an + alternative to specifying answer_key. + """ + + if isinstance(circular_patterns, str): + if circular_patterns == 'circular': + circular_patterns = get_circular_patterns(option_keys) + elif circular_patterns == 'all_possible': + circular_patterns = get_all_possible_patterns(option_keys) + else: + raise ValueError( + f'Unknown circular_patterns: {circular_patterns}') + else: + assert isinstance(circular_patterns, list) + assert all([isinstance(i, list) for i in circular_patterns]) + # TODO: other necessary sanity checks + raise NotImplementedError( + 'circular_patterns int list of list has not been tested yet') + + if answer_key is None and answer_key_switch_method is None: + raise ValueError( + 'answer_key and answer_key_switch_method cannot be both None') + if answer_key is not None and answer_key_switch_method is not None: + raise ValueError( + 'either answer_key or answer_key_switch_method should be None') + + if isinstance(dataset, Dataset): + dataset = CircularDatasetMeta.make_circular_dataset( + dataset, + circular_patterns, + option_keys, + answer_key, + answer_key_switch_method, + ) + else: + assert isinstance(dataset, DatasetDict) + dataset_dict = {} + for split in dataset: + if circular_splits is not None and split in circular_splits: + dataset_dict[ + split] = CircularDatasetMeta.make_circular_dataset( + dataset[split], + circular_patterns, + option_keys, + answer_key, + answer_key_switch_method, + ) + else: + dataset_dict[split] = dataset[split] + dataset = DatasetDict(dataset_dict) + return dataset + + def __new__(cls, name, bases, dct): + new_cls = super().__new__(cls, name, bases, dct) + + def load(cls, circular_patterns='circular', *args, **kwargs): + circular_splits = getattr(cls, 'default_circular_splits', None) + option_keys = cls.default_option_keys + answer_key = getattr(cls, 'default_answer_key', None) + answer_key_switch_method = getattr( + cls, 'default_answer_key_switch_method', None) + dataset = cls.dataset_class.load(*args, **kwargs) + return CircularDatasetMeta.make_circular( + dataset, + circular_splits, + circular_patterns, + option_keys, + answer_key, + answer_key_switch_method, + ) + + setattr(new_cls, 'load', classmethod(load)) + return new_cls + + +class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta): + dataset_class = CEvalDataset + default_circular_splits = ['val', 'test'] + default_option_keys = ['A', 'B', 'C', 'D'] + default_answer_key = 'answer' + + +class CircularMMLUDataset(MMLUDataset, metaclass=CircularDatasetMeta): + dataset_class = MMLUDataset + default_circular_splits = ['test'] + default_option_keys = ['A', 'B', 'C', 'D'] + default_answer_key = 'target' + + +class CircularCMMLUDataset(CMMLUDataset, metaclass=CircularDatasetMeta): + dataset_class = CMMLUDataset + default_circular_splits = ['test'] + default_option_keys = ['A', 'B', 'C', 'D'] + default_answer_key = 'answer' + + +class CircularCSQADataset(commonsenseqaDataset, metaclass=CircularDatasetMeta): + dataset_class = commonsenseqaDataset + default_circular_splits = ['validation'] + default_option_keys = ['A', 'B', 'C', 'D', 'E'] + default_answer_key = 'answerKey' + + +class CircularARCDataset(ARCDataset, 
metaclass=CircularDatasetMeta): + dataset_class = ARCDataset + default_circular_splits = None + default_option_keys = ['textA', 'textB', 'textC', 'textD'] + + def default_answer_key_switch_method(item, circular_pattern): + circular_pattern = tuple(i[-1] for i in circular_pattern) + item['answerKey'] = circular_pattern['ABCD'.index(item['answerKey'])] + return item + + +class CircularHSWAGDataset(hellaswagDataset_V2, metaclass=CircularDatasetMeta): + dataset_class = hellaswagDataset_V2 + default_circular_splits = None + default_option_keys = ['A', 'B', 'C', 'D'] + default_answer_key = 'label' + + +class CircularOBQADataset(OBQADataset, metaclass=CircularDatasetMeta): + dataset_class = OBQADataset + default_circular_splits = None + default_option_keys = ['A', 'B', 'C', 'D'] + default_answer_key = 'answerKey' + + +class CircularRaceDataset(RaceDataset, metaclass=CircularDatasetMeta): + dataset_class = RaceDataset + default_circular_splits = ['test'] + default_option_keys = ['A', 'B', 'C', 'D'] + default_answer_key = 'answer' + + +class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta): + dataset_class = XiezhiDataset + default_circular_splits = None + default_option_keys = ['A', 'B', 'C', 'D'] + default_answer_key = 'answer' + + +class CircularEvaluator(BaseEvaluator): + """This Evaluator assesses datasets post-Circular processing, generating + the following evaluation metrics: + + - `acc_{origin|circular|all_possible}`: Treats each question with shuffled + answer options as separate, calculating accuracy. + - `perf_{origin|circular|all_possible}`: According Circular logic, a + question is considered correct only if all its variations with shuffled + options are answered correctly, calculating accuracy. perf is short for + perfect. + - `more_{num}_{origin|circular|all_possible}`: According to Circular logic, + a question is considered correct only if the number of its variations + answered correctly is greater than or equal to `num`, calculating + accuracy. + + Note that when the `all_possible` method is used to shuffle option order, + it naturally includes the Circular method, and its metrics will also be + output. + + Args: + circular_pattern: The method of shuffling options, either 'circular' or + 'all_possible', defaulting to 'circular'. 
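+
+    For example, with `circular_pattern='circular'` and four options, `score`
+    returns `acc_origin`, `acc_circular`, `perf_origin`, `perf_circular` and
+    `more_1_circular` / `more_2_circular` / `more_3_circular`.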
+ """ + + def __init__(self, circular_pattern='circular'): + super().__init__() + self.circular_pattern = circular_pattern + + def score(self, predictions, references, test_set): + circular_patterns = {} + circular_patterns['origin'] = get_origin_patterns( + test_set[0]['circular_pattern']) + circular_patterns['circular'] = get_circular_patterns( + test_set[0]['circular_pattern']) + if self.circular_pattern == 'all_possible': + circular_patterns['all_possible'] = get_all_possible_patterns( + test_set[0]['circular_pattern']) + + metrics = {} + tmp_metrics = {} + tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns}) + tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns}) + # calculate the original accuracy + for pred, ref, origin_item in zip(predictions, references, test_set): + circular_pattern = origin_item['circular_pattern'] + for k in circular_patterns: + if tuple(circular_pattern) in circular_patterns[k]: + tmp_metrics[f'correct_{k}'] += 1 if pred == ref else 0 + tmp_metrics[f'count_{k}'] += 1 + + for k in circular_patterns: + metrics[f'acc_{k}'] = (tmp_metrics[f'correct_{k}'] / + tmp_metrics[f'count_{k}'] * 100) + + # calculate the circular accuracy + _details = {k: {} for k in circular_patterns} + for pred, ref, origin_item in zip(predictions, references, test_set): + index = origin_item['qid'] + circular_pattern = origin_item['circular_pattern'] + for k in circular_patterns: + if tuple(circular_pattern) in circular_patterns[k]: + _details[k].setdefault( + index, []).append(True if pred == ref else False) + for k in _details: + _details[k] = { + index: sum(_details[k][index]) + for index in _details[k] + } + for k in _details: + for j in range(1, len(circular_patterns[k]) + 1): + count = sum([_details[k][index] >= j for index in _details[k]]) + total = len(_details[k]) + if j != len(circular_patterns[k]): + metrics[f'more_{j}_{k}'] = count / total * 100 + else: + metrics[f'perf_{k}'] = count / total * 100 + + return metrics diff --git a/opencompass/summarizers/__init__.py b/opencompass/summarizers/__init__.py index 2c06aeb8..612f02eb 100644 --- a/opencompass/summarizers/__init__.py +++ b/opencompass/summarizers/__init__.py @@ -1,4 +1,5 @@ +from .circular import CircularSummarizer from .default import DefaultSummarizer from .subjective import SubjectiveSummarizer -__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer'] +__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer', 'CircularSummarizer'] diff --git a/opencompass/summarizers/circular.py b/opencompass/summarizers/circular.py new file mode 100644 index 00000000..d3b62cae --- /dev/null +++ b/opencompass/summarizers/circular.py @@ -0,0 +1,57 @@ +from typing import List, Optional + +from mmengine import ConfigDict + +from opencompass.utils import dataset_abbr_from_cfg +from opencompass.utils.prompt import get_prompt_hash + +from .default import DefaultSummarizer + + +class CircularSummarizer(DefaultSummarizer): + + def __init__(self, + config: ConfigDict, + dataset_abbrs: Optional[List[str]] = None, + summary_groups: List = [], + prompt_db=None, + metric_types=None) -> None: + super().__init__(config, dataset_abbrs, summary_groups, prompt_db) + self.metric_types = metric_types + + def _format_table(self, parsed_results, dataset_metrics, + dataset_eval_mode): + prompt_version = { + dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] + for d in self.dataset_cfgs + } + + table = [] + header1 = ['dataset', 'version', 'mode'] + sum( + [[model_abbr] + ['-' for _ in range(len(self.metric_types) - 1)] + for model_abbr 
in self.model_abbrs], []) + table.append(header1) + header2 = ['-', '-', '-'] + sum( + [self.metric_types for _ in self.model_abbrs], []) + table.append(header2) + for dataset_abbr in self.dataset_abbrs: + if dataset_abbr not in dataset_metrics: + table.append([dataset_abbr, '-', '-'] + ['-'] * + len(self.model_abbrs) * len(self.metric_types)) + continue + row = [ + dataset_abbr, + prompt_version.get(dataset_abbr, '-'), + dataset_eval_mode.get(dataset_abbr, '-') + ] + for model_abbr in self.model_abbrs: + for metric in self.metric_types: + if dataset_abbr in parsed_results[ + model_abbr] and metric in parsed_results[ + model_abbr][dataset_abbr]: + row.append('{:.02f}'.format( + parsed_results[model_abbr][dataset_abbr][metric])) + else: + row.append('-') + table.append(row) + return table diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index aa1acffe..8ae4e896 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -1,10 +1,11 @@ # flake8: noqa # yapf: disable +import functools import getpass import math import os.path as osp from datetime import datetime -from typing import List, Optional +from typing import Any, Dict, List, Optional import mmengine import tabulate @@ -22,12 +23,9 @@ class DefaultSummarizer: """Default summarizer in OpenCompass. Args: - config (ConfigDict): The configuration object of the evaluation task. - It's expected to be filled out at runtime. - dataset_abbrs (list[str], optional): Dataset abbreviations to be - listed in the summary. - summary_groups (list): The dataset groups whose results need to be - averaged out. For example, mmlu. Each item it a dict with + config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime. + dataset_abbrs (list[str], optional): Dataset abbreviations to be listed in the summary. + summary_groups (list): The dataset groups whose results need to be averaged out. For example, mmlu. Each item it a dict with 'name' (str) and 'subsets' (list of dataset abbrs), and optionally 'weights' if weighted average is needed. prompt_db: A deprecated field. @@ -48,28 +46,37 @@ class DefaultSummarizer: if self.cfg.get('lark_bot_url', None): self.lark_reporter = LarkReporter(self.cfg['lark_bot_url']) - def summarize( - self, - output_path: str = None, - time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa + self.model_cfgs = self.cfg['models'] + self.dataset_cfgs = self.cfg['datasets'] + self.work_dir = self.cfg['work_dir'] + self.model_abbrs = [model_abbr_from_cfg(model) for model in self.model_cfgs] - model_cfgs = self.cfg['models'] - dataset_cfgs = self.cfg['datasets'] - work_dir = self.cfg['work_dir'] + def _pick_up_results(self): + """The function reads the numerical results of evaluations from the + output folder based on the configuration file, and ultimately returns + four dictionaries, each containing processed information in different + formats. The contents of the four dictionaries are as follows: - # pick up results - raw_results = {} - parsed_results = {} - dataset_metrics = {} + - raw_results: contains the raw results of each model on each dataset (excluding details). + - parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored. + - dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. 
The list is ordered according to the METRIC_WHITELIST, + with metrics appearing earlier considered more important. + - dataset_eval_mode: contains the evaluation mode for each dataset. + """ + # raw_results: {model_abbr: {dataset_abbr: result}} + raw_results : Dict[str, Dict[str, Any]] = {} + # parsed_results: {model_abbr: {dataset_abbr: {metric: score}}} + parsed_results : Dict[str, Dict[str, Dict[str, float]]] = {} + # dataset_metrics: {dataset_abbr: [metric]} + dataset_metrics : Dict[str, List[str]] = {} - model_abbrs = [model_abbr_from_cfg(model) for model in model_cfgs] - for model in model_cfgs: + for model in self.model_cfgs: model_abbr = model_abbr_from_cfg(model) parsed_results[model_abbr] = {} raw_results[model_abbr] = {} - for dataset in dataset_cfgs: + for dataset in self.dataset_cfgs: dataset_abbr = dataset_abbr_from_cfg(dataset) - filepath = get_infer_output_path(model, dataset, osp.join(work_dir, 'results')) + filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results')) if not osp.exists(filepath): continue result = mmengine.load(filepath) @@ -78,34 +85,28 @@ class DefaultSummarizer: if 'error' in result: self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}') continue - else: - parsed_results[model_abbr][dataset_abbr] = [] - dataset_metrics[dataset_abbr] = [] - for metric, score in result.items(): - if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)): - parsed_results[model_abbr][dataset_abbr].append(score) - dataset_metrics[dataset_abbr].append(metric) - else: - continue - if len(parsed_results[model_abbr][dataset_abbr]) == 0: - self.logger.warning(f'unknown result format: {result}, continue') - del parsed_results[model_abbr][dataset_abbr] - del dataset_metrics[dataset_abbr] + _rst, _dm = {}, [] + for metric, score in result.items(): + if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)): + _rst[metric] = score + _dm.append(metric) + else: continue - indice = sorted( - list(range(len(dataset_metrics[dataset_abbr]))), - key=lambda i: ( - METRIC_WHITELIST.index(dataset_metrics[dataset_abbr][i]) - if dataset_metrics[dataset_abbr][i] in METRIC_WHITELIST - else len(METRIC_WHITELIST) - ) - ) - parsed_results[model_abbr][dataset_abbr] = [parsed_results[model_abbr][dataset_abbr][i] for i in indice] - dataset_metrics[dataset_abbr] = [dataset_metrics[dataset_abbr][i] for i in indice] + if len(_rst) == 0: + self.logger.warning(f'unknown result format: {result}, continue') + continue + _dm = sorted(_dm, key=lambda i: METRIC_WHITELIST.index(i) if i in METRIC_WHITELIST else len(METRIC_WHITELIST)) - # parse eval mode - dataset_eval_mode = {} - for dataset in dataset_cfgs: + if dataset_abbr in dataset_metrics: + assert tuple(dataset_metrics[dataset_abbr]) == tuple(_dm), \ + f'{dataset_abbr} has different metrics: {dataset_metrics[dataset_abbr]} vs {_dm}' + else: + dataset_metrics[dataset_abbr] = _dm + parsed_results[model_abbr][dataset_abbr] = _rst + + # dataset_eval_mode: {dataset_abbr: eval_mode} + dataset_eval_mode : Dict[str, str] = {} + for dataset in self.dataset_cfgs: inferencer = dataset.get('infer_cfg', {}).get('inferencer', {}).get('type', '') inferencer = inferencer if isinstance(inferencer, str) else inferencer.__name__ dataset_abbr = dataset_abbr_from_cfg(dataset) @@ -116,64 +117,97 @@ class DefaultSummarizer: else: dataset_eval_mode[dataset_abbr] = 'unknown' self.logger.warning(f'unknown inferencer: {inferencer} - {dataset_abbr}') + return raw_results, parsed_results, dataset_metrics, 
dataset_eval_mode - # calculate group metrics + def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics, dataset_eval_mode): + """The function calculates the numerical results for each group based + on the configuration in summary_groups, and updates the contents of + each dictionary accordingly.""" summary_groups = self.summary_groups for sg in summary_groups: - for model_abbr in model_abbrs: - results = {} - eval_modes = [] - for dataset_abbr in sg['subsets']: - if dataset_abbr in parsed_results[model_abbr]: - results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0] - eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) - if len(results) == len(sg['subsets']): - if 'std' in sg and sg['std'] == True: - avg = sum(results[k] for k in results) / len(results) - variance = sum((results[k] - avg)**2 for k in results) / len(results) - metric = 'standard_deviation' - results[metric] = math.sqrt(variance) - else: - if 'weights' in sg: - numerator = sum(results[k] * sg['weights'][k] for k in sg['weights']) - denominator = sum(sg['weights'].values()) - metric = 'weighted_average' - else: - numerator = sum(results[k] for k in results) - denominator = len(results) - metric = 'naive_average' - results[metric] = numerator / denominator + for model_abbr in self.model_abbrs: + available_count = sum(dataset_abbr in parsed_results[model_abbr] for dataset_abbr in sg['subsets']) + if available_count == 0: + continue + if available_count != len(sg['subsets']): + raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(parsed_results[model_abbr].keys()))} + continue + if sg.get('std', False): + default_metric = 'standard_deviation' + elif sg.get('weights', []): + default_metric = 'weighted_average' + else: + default_metric = 'naive_average' + scores, eval_modes, group_metrics = {}, [], None + if any(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']) and \ + any(isinstance(dataset_abbr, str) for dataset_abbr in sg['subsets']): + raise NotImplementedError('mixed dataset_abbr type is not supported') + + if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']): + group_metrics = [default_metric] + for dataset_abbr, metric in sg['subsets']: + scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric]) + eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) + else: + group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']])) + if len(group_metrics) > 1: + for metric in group_metrics: + for dataset_abbr in sg['subsets']: + scores.setdefault(metric, []).append(parsed_results[model_abbr][dataset_abbr][metric]) + eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown')) + else: + group_metrics = [default_metric] + for dataset_abbr in sg['subsets']: + metric = dataset_metrics[dataset_abbr][0] + scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric]) + eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) + + result = {} + for metric in scores: + if default_metric == 'standard_deviation': + avg = sum(scores[metric]) / len(scores[metric]) + variance = sum((k - avg) ** 2 for k in scores[metric]) / len(scores[metric]) + scores[metric] = result[metric] = math.sqrt(variance) + else: + if default_metric == 'weighted_average': + numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights']) + 
denominator = sum(sg['weights'].values()) + else: + numerator = sum(scores[metric]) + denominator = len(scores[metric]) + scores[metric] = result[metric] = numerator / denominator eval_modes = list(set(eval_modes)) eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed' - # add to global results - raw_results[model_abbr][sg['name']] = results - parsed_results[model_abbr][sg['name']] = [results[metric]] - dataset_metrics[sg['name']] = [metric] - dataset_eval_mode[sg['name']] = eval_mode - elif len(results) == 0: - continue - else: - raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(results.keys()))} + # add to global results + raw_results[model_abbr][sg['name']] = scores + parsed_results[model_abbr][sg['name']]= result + dataset_metrics[sg['name']] = group_metrics + dataset_eval_mode[sg['name']] = eval_mode - prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in dataset_cfgs} + return raw_results, parsed_results, dataset_metrics, dataset_eval_mode + + def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode): + dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs] + prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs} - # format table summarizer_dataset_abbrs = [] if self.dataset_abbrs is None: - for dataset in dataset_cfgs: - dataset_abbr = dataset_abbr_from_cfg(dataset) + # display all dataset metrics included in the config + for dataset_abbr in dataset_abbrs: if dataset_abbr in dataset_metrics: for metric in dataset_metrics[dataset_abbr]: summarizer_dataset_abbrs.append((dataset_abbr, metric)) else: summarizer_dataset_abbrs.append((dataset_abbr, None)) + # along with all possible group metrics for dataset_abbr in dataset_metrics: for metric in dataset_metrics[dataset_abbr]: if (dataset_abbr, metric) not in summarizer_dataset_abbrs: summarizer_dataset_abbrs.append((dataset_abbr, metric)) else: + # follow the required order for item in self.dataset_abbrs: if isinstance(item, str): summarizer_dataset_abbrs.append((item, None)) @@ -181,79 +215,103 @@ class DefaultSummarizer: summarizer_dataset_abbrs.append((item[0], item[1])) table = [] - header = ['dataset', 'version', 'metric', 'mode'] + model_abbrs + header = ['dataset', 'version', 'metric', 'mode'] + self.model_abbrs table.append(header) for dataset_abbr, metric in summarizer_dataset_abbrs: if dataset_abbr not in dataset_metrics: - table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs)) + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) continue if metric is None: - index = 0 metric = dataset_metrics[dataset_abbr][0] elif metric in dataset_metrics[dataset_abbr]: - index = dataset_metrics[dataset_abbr].index(metric) + pass else: - table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs)) + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) continue row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')] - for model_abbr in model_abbrs: + for model_abbr in self.model_abbrs: if dataset_abbr in parsed_results[model_abbr]: - row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][index])) + row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][metric])) else: row.append('-') table.append(row) + return table - # format raw txt + def _format_raw_txt(self, raw_results): raw_dataset_abbrs = [] - for model_abbr in 
model_abbrs: + for model_abbr in self.model_abbrs: for dataset_abbr in raw_results[model_abbr]: if dataset_abbr not in raw_dataset_abbrs: raw_dataset_abbrs.append(dataset_abbr) raw_txts = [] - for model_abbr in model_abbrs: + for model_abbr in self.model_abbrs: raw_txts.append('-------------------------------') raw_txts.append(f'Model: {model_abbr}') for dataset_abbr in raw_dataset_abbrs: result = raw_results[model_abbr].get(dataset_abbr, '{}') raw_txts.append(f'{dataset_abbr}: {result}') raw_txts = '\n'.join(raw_txts) + return raw_txts - # output to screean - print(tabulate.tabulate(table, headers='firstrow')) - + def _output_to_file(self, output_path, time_str, table, raw_txts): # output to file if output_path is None: - output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt') - output_csv_path = osp.join(work_dir, 'summary', f'summary_{time_str}.csv') + output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt') + output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv') else: output_csv_path = output_path.replace('.txt', '.csv') output_dir = osp.split(output_path)[0] mmengine.mkdir_or_exist(output_dir) with open(output_path, 'w', encoding='utf-8') as f: - f.write(time_str + '\n') - f.write('tabulate format\n') - f.write('^' * 128 + '\n') - f.write(tabulate.tabulate(table, headers='firstrow') + '\n') - f.write('$' * 128 + '\n') - f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n') - f.write('csv format\n') - f.write('^' * 128 + '\n') - f.write('\n'.join([','.join(row) for row in table]) + '\n') - f.write('$' * 128 + '\n') - f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n') - f.write('raw format\n') - f.write('^' * 128 + '\n') - f.write(raw_txts + '\n') - f.write('$' * 128 + '\n') + text = f'{time_str}\n' + \ + 'tabulate format\n' + \ + '^' * 128 + '\n' + \ + tabulate.tabulate(table, headers='firstrow') + '\n' + \ + '$' * 128 + '\n\n' + \ + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \ + 'csv format\n' + \ + '^' * 128 + '\n' + \ + '\n'.join([','.join(row) for row in table]) + '\n' + \ + '$' * 128 + '\n\n' + \ + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \ + 'raw format\n' + \ + '^' * 128 + '\n' + \ + raw_txts + '\n' + \ + '$' * 128 + '\n' + f.write(text) self.logger.info(f'write summary to {osp.abspath(output_path)}') + with open(output_csv_path, 'w', encoding='utf-8') as f: + f.write('\n'.join([','.join(row) for row in table]) + '\n') + self.logger.info(f'write csv to {osp.abspath(output_csv_path)}') + + def summarize( + self, + output_path: str = None, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa + + # pick up results + raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results() + + # calculate group metrics + raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \ + self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode) + + # format table + table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode) + + # format raw txt + raw_txts = self._format_raw_txt(raw_results) + + # output to screen + print(tabulate.tabulate(table, headers='firstrow')) + + # output to .text / .csv files + self._output_to_file(output_path, time_str, table, raw_txts) + if self.lark_reporter: content = f'{getpass.getuser()} 的' content += f'详细评测汇总已输出至 {osp.abspath(output_path)}' self.lark_reporter.post(content) - - with open(output_csv_path, 'w', encoding='utf-8') as f: - 
f.write('\n'.join([','.join(row) for row in table]) + '\n') - self.logger.info(f'write csv to {osp.abspath(output_csv_path)}') diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 52f25504..f0e4b2d6 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -172,6 +172,7 @@ class OpenICLEvalTask(BaseTask): preds['predictions'] = pred_strs preds['references'] = (test_set[self.output_column] if self.output_column else None) + preds['test_set'] = test_set preds = { k: preds[k] for k in signature(icl_evaluator.score).parameters