[Feature] Add circular eval (#610)

* refactor default, add circular summarizer

* add circular

* update impl

* update doc

* minor update

* no more to be added
Fengzhe Zhou 2023-11-23 16:45:47 +08:00 committed by GitHub
parent 5202456b4c
commit d949e3c003
12 changed files with 915 additions and 145 deletions

configs/eval_circular.py Normal file
@@ -0,0 +1,91 @@
from mmengine.config import read_base
from opencompass.datasets.circular import (CircularCEvalDataset, CircularMMLUDataset, CircularCMMLUDataset, CircularCSQADataset,
                                           CircularARCDataset, CircularHSWAGDataset, CircularOBQADataset, CircularRaceDataset, CircularEvaluator)
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from .datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from .datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from .datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import commonsenseqa_datasets
    from .datasets.obqa.obqa_gen_9069e4 import obqa_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets

    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b_model
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b_model
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_model
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat_model

    from .summarizers.groups.mmlu import mmlu_summary_groups
    from .summarizers.groups.cmmlu import cmmlu_summary_groups
    from .summarizers.groups.ceval import ceval_summary_groups

for ds, t in [
    (ceval_datasets, CircularCEvalDataset),
    (mmlu_datasets, CircularMMLUDataset),
    (cmmlu_datasets, CircularCMMLUDataset),
    (hellaswag_datasets, CircularHSWAGDataset),
    (ARC_e_datasets, CircularARCDataset),
    (ARC_c_datasets, CircularARCDataset),
    (commonsenseqa_datasets, CircularCSQADataset),
    (obqa_datasets, CircularOBQADataset),
    (race_datasets, CircularRaceDataset),
]:
    for d in ds:
        d['type'] = t
        d['abbr'] = d['abbr'] + '-circular-4'
        d['eval_cfg']['evaluator'] = {'type': CircularEvaluator, 'circular_pattern': 'circular'}
        d['circular_patterns'] = 'circular'

datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
models = sum([v for k, v in locals().items() if k.endswith("_model")], [])

# config summarizer
other_summary_groups = [
    {'name': 'average',
     'subsets': ['ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c', 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high']},
]
origin_summary_groups = sum([v for k, v in locals().items() if k.endswith("_summary_groups")], [])
new_summary_groups = []
for item in origin_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'average-circular-4',
        'ceval-circular-4',
        'mmlu-circular-4',
        'cmmlu-circular-4',
        'hellaswag-circular-4',
        'ARC-e-circular-4',
        'ARC-c-circular-4',
        'commonsense_qa-circular-4',
        'openbookqa_fact-circular-4',
        'race-middle-circular-4',
        'race-high-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
        'mmlu-humanities-circular-4',
        'mmlu-stem-circular-4',
        'mmlu-social-science-circular-4',
        'mmlu-other-circular-4',
        'cmmlu-humanities-circular-4',
        'cmmlu-stem-circular-4',
        'cmmlu-social-science-circular-4',
        'cmmlu-other-circular-4',
        'cmmlu-china-specific-circular-4',
    ],
    summary_groups=new_summary_groups,
)

@@ -0,0 +1,113 @@
# CircularEval
## Background
For multiple-choice questions, when an LLM picks the correct option, that does not necessarily mean it truly understood the question and reasoned its way to the answer; it may simply have guessed. To distinguish these cases, and to reduce the model's bias towards particular option positions, CircularEval can be used: each multiple-choice question is augmented by shuffling its options, and the question is counted as correct under CircularEval only if the LLM answers every augmented variant correctly.
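As a concrete illustration, the sketch below (made-up question, `circular` pattern) shows how one item becomes four variants whose gold label moves together with the option contents:
```python
# Toy illustration of circular augmentation (illustration only, not OpenCompass code)
question = 'Which planet is closest to the Sun?'
options = {'A': 'Mercury', 'B': 'Venus', 'C': 'Earth', 'D': 'Mars'}  # gold answer: A (Mercury)

# Under the 'circular' pattern the item is expanded into 4 variants; the option
# contents are rotated and the gold label follows the correct content:
#   variant 1: A=Mercury, B=Venus,   C=Earth,   D=Mars     -> answer A
#   variant 2: A=Mars,    B=Mercury, C=Venus,   D=Earth    -> answer B
#   variant 3: A=Earth,   B=Mars,    C=Mercury, D=Venus    -> answer C
#   variant 4: A=Venus,   B=Earth,   C=Mars,    D=Mercury  -> answer D
# The question counts as correct under CircularEval only if all 4 variants are answered correctly.
```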
## Adding Your Own CircularEval Dataset
Generally, to evaluate a dataset with CircularEval, both its loading and its evaluation method need to be overridden, so modifications are required in both the OpenCompass main library and the configuration files. We use C-Eval as an example below.
OpenCompass main library:
```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The overloaded dataset class
    dataset_class = CEvalDataset

    # Splits of the DatasetDict that need CircularEval. CEvalDataset loads [dev, val, test];
    # only 'val' and 'test' need CircularEval, 'dev' does not
    default_circular_splits = ['val', 'test']

    # List of keys (options) to be shuffled
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of 'answer_key' is one of ['A', 'B', 'C', 'D'] and denotes the
    # correct answer. It indicates how to update the correct answer after the options are
    # shuffled. Mutually exclusive with default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the content of 'answer_key' is not one of ['A', 'B', 'C', 'D'], a function can be used
    # to specify the correct answer after the options are shuffled. Mutually exclusive with
    # default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # 'item' is the original data item
    #     # 'circular_pattern' is a tuple giving the option order after shuffling, e.g.
    #     # ('D', 'A', 'B', 'C') means the original option A is now D, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```
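For datasets whose options are not stored under single-letter keys, `default_answer_key_switch_method` is used instead of `default_answer_key`. A sketch modeled on the `CircularARCDataset` added in this PR (option contents live under `textA`–`textD`, while the answer is stored as a letter):
```python
from opencompass.datasets.arc import ARCDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta):
    dataset_class = ARCDataset
    # ARCDataset.load returns a plain Dataset, so there are no splits to select
    default_circular_splits = None
    default_option_keys = ['textA', 'textB', 'textC', 'textD']

    def default_answer_key_switch_method(item, circular_pattern):
        # circular_pattern contains option keys such as ('textD', 'textA', 'textB', 'textC');
        # keep only the trailing letter of each key so it can be indexed with 'ABCD'
        circular_pattern = tuple(i[-1] for i in circular_pattern)
        item['answerKey'] = circular_pattern['ABCD'.index(item['answerKey'])]
        return item
```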
`CircularCEvalDataset` accepts a `circular_patterns` parameter with two possible values (see the sketch after this list):
- `circular`: a single rotation (the default). ABCD is expanded to ABCD, BCDA, CDAB and DABC, 4 variants in total.
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total.
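The two expansions correspond to the `get_circular_patterns` and `get_all_possible_patterns` helpers that this PR adds in `opencompass/datasets/circular.py`; a self-contained sketch of their behaviour:
```python
import itertools

def get_circular_patterns(option_keys):
    # rotations: ABCD -> ABCD, BCDA, CDAB, DABC
    double_option_keys = option_keys + option_keys
    return [tuple(double_option_keys[i:i + len(option_keys)])
            for i in range(len(option_keys))]

def get_all_possible_patterns(option_keys):
    # every permutation of the option keys: 4! = 24 orderings for ABCD
    return list(itertools.permutations(option_keys))

print(len(get_circular_patterns(['A', 'B', 'C', 'D'])))      # 4
print(len(get_all_possible_patterns(['A', 'B', 'C', 'D'])))  # 24
```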
Additionally, we provide a `CircularEvaluator` as a replacement for `AccEvaluator`. The evaluator also accepts `circular_pattern`, which should be consistent with the dataset's setting. It produces the following metrics (a worked toy calculation follows the list):
- `acc_{origin|circular|all_possible}`: treats every shuffled variant as an independent question and computes accuracy over variants.
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over questions ("perf" is short for "perfect").
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over questions.
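To make the relationship between these metrics concrete, here is a toy calculation (made-up correctness values) that mirrors the grouping-by-question logic of `CircularEvaluator`:
```python
# Per-question correctness of the 4 circular variants (toy data)
results = {
    'q0': [True, True, True, True],
    'q1': [True, True, False, True],
    'q2': [True, False, False, False],
}
n_variants = 4

acc_circular = sum(sum(v) for v in results.values()) / (len(results) * n_variants) * 100
perf_circular = sum(all(v) for v in results.values()) / len(results) * 100          # all 4 variants correct
more_2_circular = sum(sum(v) >= 2 for v in results.values()) / len(results) * 100   # at least 2 variants correct

print(acc_circular, perf_circular, more_2_circular)  # ~66.7, ~33.3, ~66.7
```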
OpenCompass configuration file:
```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset, CircularEvaluator

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Overload the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish it from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Overload the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# The dataset after the above operations looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```
Additionally, for better presentation of results in CircularEval, consider using the following summarizer:
```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select the specific metrics to view
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```
For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py

@@ -67,6 +67,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
.. _Tools:
.. toctree::

@@ -0,0 +1,111 @@
# CircularEval
## Background
For multiple-choice questions, when an LLM gives the correct option, it does not necessarily mean that it truly understood the question and reasoned its way to the answer; it may simply have guessed. To distinguish these two situations, and to reduce the LLM's bias towards particular option positions, we can use CircularEval: each multiple-choice question is augmented by shuffling its options, and only if the LLM answers every augmented variant correctly do we consider the question answered correctly under CircularEval.
## Adding Your Own CircularEval Dataset
In general, to evaluate a dataset with CircularEval, both its loading and its evaluation method need to be overridden, so changes are required in both the OpenCompass main library and the configuration files. We use C-Eval as an example below.
OpenCompass main library:
```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The dataset class whose load method is overridden
    dataset_class = CEvalDataset

    # If the original load method returns a DatasetDict, the splits that should be circular-evaluated.
    # CEvalDataset loads [dev, val, test]; only 'val' and 'test' need CircularEval, 'dev' does not
    default_circular_splits = ['val', 'test']

    # The list of keys (options) to be shuffled
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of answer_key is one of ['A', 'B', 'C', 'D'] and denotes the correct
    # answer. It tells how to update the correct answer after the options are shuffled.
    # Mutually exclusive with default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the content of answer_key is not one of ['A', 'B', 'C', 'D'], a function can be used to
    # specify the correct answer after the options are shuffled. Mutually exclusive with
    # default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # item is the original data item
    #     # circular_pattern is a tuple giving the option order after shuffling, e.g.
    #     # ('D', 'A', 'B', 'C') means the original option A becomes D, the original B becomes A,
    #     # and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```
`CircularCEvalDataset` accepts a `circular_patterns` parameter with two possible values:
- `circular`: a single rotation (the default). ABCD is expanded to ABCD, BCDA, CDAB and DABC, 4 variants in total
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total
In addition, we provide a `CircularEvaluator` to replace `AccEvaluator`. The evaluator also accepts `circular_pattern`, which should be consistent with the dataset's setting. It produces the following metrics:
- `acc_{origin|circular|all_possible}`: treats every shuffled variant as an independent question and computes accuracy
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over questions
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over questions
OpenCompass configuration file:
```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset, CircularEvaluator

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Override the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish it from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Override the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# After the above operations, each dataset looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```
In addition, for a better presentation of CircularEval results, consider using the following summarizer:
```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select the specific metrics to view
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```
For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py

@@ -67,6 +67,7 @@ OpenCompass 上手路线
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
.. _工具:
.. toctree::

@@ -13,6 +13,7 @@ from .cb import * # noqa: F401, F403
from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .circular import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403
from .clozeTest_maxmin import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403

@@ -14,32 +14,19 @@ class ARCDataset(BaseDataset):
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for i, line in enumerate(in_f):
sample = json.loads(line.strip())
answerKey = sample['answerKey']
sample = sample['question']
question = sample['stem']
choices = sample['choices']
if len(choices) != 4:
for line in in_f:
item = json.loads(line.strip())
question = item['question']
if len(question['choices']) != 4:
continue
textA = choices[0]['text']
textB = choices[1]['text']
textC = choices[2]['text']
textD = choices[3]['text']
labels = [c['label'] for c in question['choices']]
answerKey = 'ABCD'[labels.index(item['answerKey'])]
rows.append({
'question': question,
'question': question['stem'],
'answerKey': answerKey,
'textA': textA,
'textB': textB,
'textC': textC,
'textD': textD
'textA': question['choices'][0]['text'],
'textB': question['choices'][1]['text'],
'textC': question['choices'][2]['text'],
'textD': question['choices'][3]['text'],
})
dataset = Dataset.from_dict({
'question': [row['question'] for row in rows],
'answerKey': [row['answerKey'] for row in rows],
'textA': [row['textA'] for row in rows],
'textB': [row['textB'] for row in rows],
'textC': [row['textC'] for row in rows],
'textD': [row['textD'] for row in rows]
})
return dataset
return Dataset.from_list(rows)

@@ -0,0 +1,348 @@
import copy
import itertools
from typing import Callable, List, Optional, Union
from datasets import Dataset, DatasetDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
from .arc import ARCDataset
from .ceval import CEvalDataset
from .cmmlu import CMMLUDataset
from .commonsenseqa import commonsenseqaDataset
from .hellaswag import hellaswagDataset_V2
from .mmlu import MMLUDataset
from .obqa import OBQADataset
from .race import RaceDataset
from .xiezhi import XiezhiDataset
def get_origin_patterns(option_keys):
return [tuple(option_keys)]
def get_circular_patterns(option_keys):
double_option_keys = option_keys + option_keys
circular_patterns = [
tuple(double_option_keys[i:i + len(option_keys)])
for i in range(len(option_keys))
]
return circular_patterns
def get_all_possible_patterns(option_keys):
circular_patterns = list(itertools.permutations(option_keys))
return circular_patterns
class CircularDatasetMeta(type):
"""This Meta Class is designed to transform a class that reads datasets
into one that supports reading datasets required for CircularEval. It
overloads an existing load method for the original class.
The Meta Class should possess the following attributes:
- `dataset_class` (class): The class for reading datasets, such as
`CEvalDataset`.
- `default_circular_splits` (list, optional): The default splits of the
dataset that need to undergo CircularEval, like ['val', 'test']. If a
`Dataset` is loaded originally, this field will be ignored.
- `default_option_keys` (list): The keys for options in the dataset, such
as ['A', 'B', 'C', 'D'].
- `default_answer_key` (str, optional): The key for answers in the dataset,
like 'answer'. This is an alternative to
`default_answer_key_switch_method`.
- `default_answer_key_switch_method` (function, optional): The method to
transform the key for answers in the dataset. This is an alternative to
`default_answer_key`.
"""
@staticmethod
def make_circular_items(
origin_item,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
qid,
):
items = []
for circular_pattern in circular_patterns:
item = copy.deepcopy(origin_item)
for i in range(len(option_keys)):
item[circular_pattern[i]] = origin_item[option_keys[i]]
if answer_key_switch_method is None:
if origin_item[answer_key] in option_keys:
item[answer_key] = circular_pattern[option_keys.index(
origin_item[answer_key])]
else:
pass
else:
item = answer_key_switch_method(item, circular_pattern)
item['qid'] = qid
item['circular_pattern'] = tuple(circular_pattern)
items.append(item)
return items
@staticmethod
def make_circular_dataset(dataset, circular_patterns, option_keys,
answer_key, answer_key_switch_method):
circulated_items = []
for i, item in enumerate(dataset):
item = CircularDatasetMeta.make_circular_items(
item,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
i,
)
circulated_items.extend(item)
return Dataset.from_list(circulated_items)
def make_circular(
dataset: Union[Dataset, DatasetDict],
circular_splits: Optional[List[str]] = ['test'],
circular_patterns: str = 'circular',
option_keys: List[str] = ['A', 'B', 'C', 'D'],
answer_key: Optional[str] = 'answer',
answer_key_switch_method: Optional[Callable] = None,
):
"""Transform the dataset into one that is compatible with CircularEval.
In CircularEval, the original multiple-choice questions with options
ABCD are augmented by shuffling the order of options, such as BCDA,
CDAB, DABC, etc. A model is considered correct only if it answers all
augmented questions correctly. This method effectively prevents models
from memorizing answers.
Args:
dataset: The dataset to be augmented.
circular_splits: List of splits to make circular. This is only
effective when the dataset is a DatasetDict.
circular_patterns: Method for circular processing, can be 'circular'
for single cycle or 'all_possible' for all permutations, default
is 'circular'.
option_keys: List of keys for options, default to ['A', 'B', 'C', 'D'].
answer_key: Key for the answer, default to 'answer'. When specified,
ensure that the content of answer_key is among the option_keys.
It is an alternative to specifying answer_key_switch_method.
answer_key_switch_method: Function to modify the answer_key. It is an
alternative to specifying answer_key.
"""
if isinstance(circular_patterns, str):
if circular_patterns == 'circular':
circular_patterns = get_circular_patterns(option_keys)
elif circular_patterns == 'all_possible':
circular_patterns = get_all_possible_patterns(option_keys)
else:
raise ValueError(
f'Unknown circular_patterns: {circular_patterns}')
else:
assert isinstance(circular_patterns, list)
assert all([isinstance(i, list) for i in circular_patterns])
# TODO: other necessary sanity checks
raise NotImplementedError(
'circular_patterns in list of list has not been tested yet')
if answer_key is None and answer_key_switch_method is None:
raise ValueError(
'answer_key and answer_key_switch_method cannot be both None')
if answer_key is not None and answer_key_switch_method is not None:
raise ValueError(
'either answer_key or answer_key_switch_method should be None')
if isinstance(dataset, Dataset):
dataset = CircularDatasetMeta.make_circular_dataset(
dataset,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
)
else:
assert isinstance(dataset, DatasetDict)
dataset_dict = {}
for split in dataset:
if circular_splits is not None and split in circular_splits:
dataset_dict[
split] = CircularDatasetMeta.make_circular_dataset(
dataset[split],
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
)
else:
dataset_dict[split] = dataset[split]
dataset = DatasetDict(dataset_dict)
return dataset
def __new__(cls, name, bases, dct):
new_cls = super().__new__(cls, name, bases, dct)
def load(cls, circular_patterns='circular', *args, **kwargs):
circular_splits = getattr(cls, 'default_circular_splits', None)
option_keys = cls.default_option_keys
answer_key = getattr(cls, 'default_answer_key', None)
answer_key_switch_method = getattr(
cls, 'default_answer_key_switch_method', None)
dataset = cls.dataset_class.load(*args, **kwargs)
return CircularDatasetMeta.make_circular(
dataset,
circular_splits,
circular_patterns,
option_keys,
answer_key,
answer_key_switch_method,
)
setattr(new_cls, 'load', classmethod(load))
return new_cls
class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
dataset_class = CEvalDataset
default_circular_splits = ['val', 'test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularMMLUDataset(MMLUDataset, metaclass=CircularDatasetMeta):
dataset_class = MMLUDataset
default_circular_splits = ['test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'target'
class CircularCMMLUDataset(CMMLUDataset, metaclass=CircularDatasetMeta):
dataset_class = CMMLUDataset
default_circular_splits = ['test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularCSQADataset(commonsenseqaDataset, metaclass=CircularDatasetMeta):
dataset_class = commonsenseqaDataset
default_circular_splits = ['validation']
default_option_keys = ['A', 'B', 'C', 'D', 'E']
default_answer_key = 'answerKey'
class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta):
dataset_class = ARCDataset
default_circular_splits = None
default_option_keys = ['textA', 'textB', 'textC', 'textD']
def default_answer_key_switch_method(item, circular_pattern):
circular_pattern = tuple(i[-1] for i in circular_pattern)
item['answerKey'] = circular_pattern['ABCD'.index(item['answerKey'])]
return item
class CircularHSWAGDataset(hellaswagDataset_V2, metaclass=CircularDatasetMeta):
dataset_class = hellaswagDataset_V2
default_circular_splits = None
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'label'
class CircularOBQADataset(OBQADataset, metaclass=CircularDatasetMeta):
dataset_class = OBQADataset
default_circular_splits = None
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answerKey'
class CircularRaceDataset(RaceDataset, metaclass=CircularDatasetMeta):
dataset_class = RaceDataset
default_circular_splits = ['test']
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta):
dataset_class = XiezhiDataset
default_circular_splits = None
default_option_keys = ['A', 'B', 'C', 'D']
default_answer_key = 'answer'
class CircularEvaluator(BaseEvaluator):
"""This Evaluator assesses datasets post-Circular processing, generating
the following evaluation metrics:
- `acc_{origin|circular|all_possible}`: treats every variant with shuffled
answer options as an independent question and computes accuracy.
- `perf_{origin|circular|all_possible}`: according to the circular logic, a
question is considered correct only if all of its shuffled variants are
answered correctly; accuracy is computed over questions. perf is short for
perfect.
- `more_{num}_{origin|circular|all_possible}`: according to the circular logic,
a question is considered correct if the number of its correctly answered
variants is greater than or equal to `num`; accuracy is computed over
questions.
Note that when the `all_possible` method is used to shuffle option order,
it naturally includes the Circular method, and its metrics will also be
output.
Args:
circular_pattern: The method of shuffling options, either 'circular' or
'all_possible', defaulting to 'circular'.
"""
def __init__(self, circular_pattern='circular'):
super().__init__()
self.circular_pattern = circular_pattern
def score(self, predictions, references, test_set):
circular_patterns = {}
circular_patterns['origin'] = get_origin_patterns(
test_set[0]['circular_pattern'])
circular_patterns['circular'] = get_circular_patterns(
test_set[0]['circular_pattern'])
if self.circular_pattern == 'all_possible':
circular_patterns['all_possible'] = get_all_possible_patterns(
test_set[0]['circular_pattern'])
metrics = {}
tmp_metrics = {}
tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
# calculate the original accuracy
for pred, ref, origin_item in zip(predictions, references, test_set):
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
tmp_metrics[f'correct_{k}'] += 1 if pred == ref else 0
tmp_metrics[f'count_{k}'] += 1
for k in circular_patterns:
metrics[f'acc_{k}'] = (tmp_metrics[f'correct_{k}'] /
tmp_metrics[f'count_{k}'] * 100)
# calculate the circular accuracy
_details = {k: {} for k in circular_patterns}
for pred, ref, origin_item in zip(predictions, references, test_set):
index = origin_item['qid']
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
_details[k].setdefault(
index, []).append(True if pred == ref else False)
for k in _details:
_details[k] = {
index: sum(_details[k][index])
for index in _details[k]
}
for k in _details:
for j in range(1, len(circular_patterns[k]) + 1):
count = sum([_details[k][index] >= j for index in _details[k]])
total = len(_details[k])
if j != len(circular_patterns[k]):
metrics[f'more_{j}_{k}'] = count / total * 100
else:
metrics[f'perf_{k}'] = count / total * 100
return metrics

@@ -1,4 +1,5 @@
from .circular import CircularSummarizer
from .default import DefaultSummarizer
from .subjective import SubjectiveSummarizer
__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer']
__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer', 'CircularSummarizer']

@@ -0,0 +1,57 @@
from typing import List, Optional
from mmengine import ConfigDict
from opencompass.utils import dataset_abbr_from_cfg
from opencompass.utils.prompt import get_prompt_hash
from .default import DefaultSummarizer
class CircularSummarizer(DefaultSummarizer):
def __init__(self,
config: ConfigDict,
dataset_abbrs: Optional[List[str]] = None,
summary_groups: List = [],
prompt_db=None,
metric_types=None) -> None:
super().__init__(config, dataset_abbrs, summary_groups, prompt_db)
self.metric_types = metric_types
def _format_table(self, parsed_results, dataset_metrics,
dataset_eval_mode):
prompt_version = {
dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6]
for d in self.dataset_cfgs
}
table = []
header1 = ['dataset', 'version', 'mode'] + sum(
[[model_abbr] + ['-' for _ in range(len(self.metric_types) - 1)]
for model_abbr in self.model_abbrs], [])
table.append(header1)
header2 = ['-', '-', '-'] + sum(
[self.metric_types for _ in self.model_abbrs], [])
table.append(header2)
for dataset_abbr in self.dataset_abbrs:
if dataset_abbr not in dataset_metrics:
table.append([dataset_abbr, '-', '-'] + ['-'] *
len(self.model_abbrs) * len(self.metric_types))
continue
row = [
dataset_abbr,
prompt_version.get(dataset_abbr, '-'),
dataset_eval_mode.get(dataset_abbr, '-')
]
for model_abbr in self.model_abbrs:
for metric in self.metric_types:
if dataset_abbr in parsed_results[
model_abbr] and metric in parsed_results[
model_abbr][dataset_abbr]:
row.append('{:.02f}'.format(
parsed_results[model_abbr][dataset_abbr][metric]))
else:
row.append('-')
table.append(row)
return table

@@ -1,10 +1,11 @@
# flake8: noqa
# yapf: disable
import functools
import getpass
import math
import os.path as osp
from datetime import datetime
from typing import List, Optional
from typing import Any, Dict, List, Optional
import mmengine
import tabulate
@@ -22,12 +23,9 @@ class DefaultSummarizer:
"""Default summarizer in OpenCompass.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
dataset_abbrs (list[str], optional): Dataset abbreviations to be
listed in the summary.
summary_groups (list): The dataset groups whose results need to be
averaged out. For example, mmlu. Each item it a dict with
config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime.
dataset_abbrs (list[str], optional): Dataset abbreviations to be listed in the summary.
summary_groups (list): The dataset groups whose results need to be averaged out. For example, mmlu. Each item is a dict with
'name' (str) and 'subsets' (list of dataset abbrs), and optionally
'weights' if weighted average is needed.
prompt_db: A deprecated field.
@@ -48,28 +46,37 @@ class DefaultSummarizer:
if self.cfg.get('lark_bot_url', None):
self.lark_reporter = LarkReporter(self.cfg['lark_bot_url'])
def summarize(
self,
output_path: str = None,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa
self.model_cfgs = self.cfg['models']
self.dataset_cfgs = self.cfg['datasets']
self.work_dir = self.cfg['work_dir']
self.model_abbrs = [model_abbr_from_cfg(model) for model in self.model_cfgs]
model_cfgs = self.cfg['models']
dataset_cfgs = self.cfg['datasets']
work_dir = self.cfg['work_dir']
def _pick_up_results(self):
"""The function reads the numerical results of evaluations from the
output folder based on the configuration file, and ultimately returns
four dictionaries, each containing processed information in different
formats. The contents of the four dictionaries are as follows:
# pick up results
raw_results = {}
parsed_results = {}
dataset_metrics = {}
- raw_results: contains the raw results of each model on each dataset (excluding details).
- parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored.
- dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. The list is ordered according to the METRIC_WHITELIST,
with metrics appearing earlier considered more important.
- dataset_eval_mode: contains the evaluation mode for each dataset.
"""
# raw_results: {model_abbr: {dataset_abbr: result}}
raw_results : Dict[str, Dict[str, Any]] = {}
# parsed_results: {model_abbr: {dataset_abbr: {metric: score}}}
parsed_results : Dict[str, Dict[str, Dict[str, float]]] = {}
# dataset_metrics: {dataset_abbr: [metric]}
dataset_metrics : Dict[str, List[str]] = {}
model_abbrs = [model_abbr_from_cfg(model) for model in model_cfgs]
for model in model_cfgs:
for model in self.model_cfgs:
model_abbr = model_abbr_from_cfg(model)
parsed_results[model_abbr] = {}
raw_results[model_abbr] = {}
for dataset in dataset_cfgs:
for dataset in self.dataset_cfgs:
dataset_abbr = dataset_abbr_from_cfg(dataset)
filepath = get_infer_output_path(model, dataset, osp.join(work_dir, 'results'))
filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results'))
if not osp.exists(filepath):
continue
result = mmengine.load(filepath)
@@ -78,34 +85,28 @@ class DefaultSummarizer:
if 'error' in result:
self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}')
continue
else:
parsed_results[model_abbr][dataset_abbr] = []
dataset_metrics[dataset_abbr] = []
for metric, score in result.items():
if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)):
parsed_results[model_abbr][dataset_abbr].append(score)
dataset_metrics[dataset_abbr].append(metric)
else:
continue
if len(parsed_results[model_abbr][dataset_abbr]) == 0:
self.logger.warning(f'unknown result format: {result}, continue')
del parsed_results[model_abbr][dataset_abbr]
del dataset_metrics[dataset_abbr]
_rst, _dm = {}, []
for metric, score in result.items():
if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)):
_rst[metric] = score
_dm.append(metric)
else:
continue
indice = sorted(
list(range(len(dataset_metrics[dataset_abbr]))),
key=lambda i: (
METRIC_WHITELIST.index(dataset_metrics[dataset_abbr][i])
if dataset_metrics[dataset_abbr][i] in METRIC_WHITELIST
else len(METRIC_WHITELIST)
)
)
parsed_results[model_abbr][dataset_abbr] = [parsed_results[model_abbr][dataset_abbr][i] for i in indice]
dataset_metrics[dataset_abbr] = [dataset_metrics[dataset_abbr][i] for i in indice]
if len(_rst) == 0:
self.logger.warning(f'unknown result format: {result}, continue')
continue
_dm = sorted(_dm, key=lambda i: METRIC_WHITELIST.index(i) if i in METRIC_WHITELIST else len(METRIC_WHITELIST))
# parse eval mode
dataset_eval_mode = {}
for dataset in dataset_cfgs:
if dataset_abbr in dataset_metrics:
assert tuple(dataset_metrics[dataset_abbr]) == tuple(_dm), \
f'{dataset_abbr} has different metrics: {dataset_metrics[dataset_abbr]} vs {_dm}'
else:
dataset_metrics[dataset_abbr] = _dm
parsed_results[model_abbr][dataset_abbr] = _rst
# dataset_eval_mode: {dataset_abbr: eval_mode}
dataset_eval_mode : Dict[str, str] = {}
for dataset in self.dataset_cfgs:
inferencer = dataset.get('infer_cfg', {}).get('inferencer', {}).get('type', '')
inferencer = inferencer if isinstance(inferencer, str) else inferencer.__name__
dataset_abbr = dataset_abbr_from_cfg(dataset)
@@ -116,64 +117,97 @@ class DefaultSummarizer:
else:
dataset_eval_mode[dataset_abbr] = 'unknown'
self.logger.warning(f'unknown inferencer: {inferencer} - {dataset_abbr}')
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
# calculate group metrics
def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics, dataset_eval_mode):
"""The function calculates the numerical results for each group based
on the configuration in summary_groups, and updates the contents of
each dictionary accordingly."""
summary_groups = self.summary_groups
for sg in summary_groups:
for model_abbr in model_abbrs:
results = {}
eval_modes = []
for dataset_abbr in sg['subsets']:
if dataset_abbr in parsed_results[model_abbr]:
results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0]
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
if len(results) == len(sg['subsets']):
if 'std' in sg and sg['std'] == True:
avg = sum(results[k] for k in results) / len(results)
variance = sum((results[k] - avg)**2 for k in results) / len(results)
metric = 'standard_deviation'
results[metric] = math.sqrt(variance)
else:
if 'weights' in sg:
numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
denominator = sum(sg['weights'].values())
metric = 'weighted_average'
else:
numerator = sum(results[k] for k in results)
denominator = len(results)
metric = 'naive_average'
results[metric] = numerator / denominator
for model_abbr in self.model_abbrs:
available_count = sum(dataset_abbr in parsed_results[model_abbr] for dataset_abbr in sg['subsets'])
if available_count == 0:
continue
if available_count != len(sg['subsets']):
raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(parsed_results[model_abbr].keys()))}
continue
if sg.get('std', False):
default_metric = 'standard_deviation'
elif sg.get('weights', []):
default_metric = 'weighted_average'
else:
default_metric = 'naive_average'
scores, eval_modes, group_metrics = {}, [], None
if any(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']) and \
any(isinstance(dataset_abbr, str) for dataset_abbr in sg['subsets']):
raise NotImplementedError('mixed dataset_abbr type is not supported')
if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']):
group_metrics = [default_metric]
for dataset_abbr, metric in sg['subsets']:
scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
if len(group_metrics) > 1:
for metric in group_metrics:
for dataset_abbr in sg['subsets']:
scores.setdefault(metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
else:
group_metrics = [default_metric]
for dataset_abbr in sg['subsets']:
metric = dataset_metrics[dataset_abbr][0]
scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
result = {}
for metric in scores:
if default_metric == 'standard_deviation':
avg = sum(scores[metric]) / len(scores[metric])
variance = sum((k - avg) ** 2 for k in scores[metric]) / len(scores[metric])
scores[metric] = result[metric] = math.sqrt(variance)
else:
if default_metric == 'weighted_average':
numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'])
denominator = sum(sg['weights'].values())
else:
numerator = sum(scores[metric])
denominator = len(scores[metric])
scores[metric] = result[metric] = numerator / denominator
eval_modes = list(set(eval_modes))
eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'
# add to global results
raw_results[model_abbr][sg['name']] = results
parsed_results[model_abbr][sg['name']] = [results[metric]]
dataset_metrics[sg['name']] = [metric]
dataset_eval_mode[sg['name']] = eval_mode
elif len(results) == 0:
continue
else:
raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(results.keys()))}
# add to global results
raw_results[model_abbr][sg['name']] = scores
parsed_results[model_abbr][sg['name']]= result
dataset_metrics[sg['name']] = group_metrics
dataset_eval_mode[sg['name']] = eval_mode
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in dataset_cfgs}
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode):
dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs]
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs}
# format table
summarizer_dataset_abbrs = []
if self.dataset_abbrs is None:
for dataset in dataset_cfgs:
dataset_abbr = dataset_abbr_from_cfg(dataset)
# display all dataset metrics included in the config
for dataset_abbr in dataset_abbrs:
if dataset_abbr in dataset_metrics:
for metric in dataset_metrics[dataset_abbr]:
summarizer_dataset_abbrs.append((dataset_abbr, metric))
else:
summarizer_dataset_abbrs.append((dataset_abbr, None))
# along with all possible group metrics
for dataset_abbr in dataset_metrics:
for metric in dataset_metrics[dataset_abbr]:
if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
summarizer_dataset_abbrs.append((dataset_abbr, metric))
else:
# follow the required order
for item in self.dataset_abbrs:
if isinstance(item, str):
summarizer_dataset_abbrs.append((item, None))
@@ -181,79 +215,103 @@ class DefaultSummarizer:
summarizer_dataset_abbrs.append((item[0], item[1]))
table = []
header = ['dataset', 'version', 'metric', 'mode'] + model_abbrs
header = ['dataset', 'version', 'metric', 'mode'] + self.model_abbrs
table.append(header)
for dataset_abbr, metric in summarizer_dataset_abbrs:
if dataset_abbr not in dataset_metrics:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
if metric is None:
index = 0
metric = dataset_metrics[dataset_abbr][0]
elif metric in dataset_metrics[dataset_abbr]:
index = dataset_metrics[dataset_abbr].index(metric)
pass
else:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]
for model_abbr in model_abbrs:
for model_abbr in self.model_abbrs:
if dataset_abbr in parsed_results[model_abbr]:
row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][index]))
row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][metric]))
else:
row.append('-')
table.append(row)
return table
# format raw txt
def _format_raw_txt(self, raw_results):
raw_dataset_abbrs = []
for model_abbr in model_abbrs:
for model_abbr in self.model_abbrs:
for dataset_abbr in raw_results[model_abbr]:
if dataset_abbr not in raw_dataset_abbrs:
raw_dataset_abbrs.append(dataset_abbr)
raw_txts = []
for model_abbr in model_abbrs:
for model_abbr in self.model_abbrs:
raw_txts.append('-------------------------------')
raw_txts.append(f'Model: {model_abbr}')
for dataset_abbr in raw_dataset_abbrs:
result = raw_results[model_abbr].get(dataset_abbr, '{}')
raw_txts.append(f'{dataset_abbr}: {result}')
raw_txts = '\n'.join(raw_txts)
return raw_txts
# output to screean
print(tabulate.tabulate(table, headers='firstrow'))
def _output_to_file(self, output_path, time_str, table, raw_txts):
# output to file
if output_path is None:
output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
output_csv_path = osp.join(work_dir, 'summary', f'summary_{time_str}.csv')
output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt')
output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv')
else:
output_csv_path = output_path.replace('.txt', '.csv')
output_dir = osp.split(output_path)[0]
mmengine.mkdir_or_exist(output_dir)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(time_str + '\n')
f.write('tabulate format\n')
f.write('^' * 128 + '\n')
f.write(tabulate.tabulate(table, headers='firstrow') + '\n')
f.write('$' * 128 + '\n')
f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n')
f.write('csv format\n')
f.write('^' * 128 + '\n')
f.write('\n'.join([','.join(row) for row in table]) + '\n')
f.write('$' * 128 + '\n')
f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n')
f.write('raw format\n')
f.write('^' * 128 + '\n')
f.write(raw_txts + '\n')
f.write('$' * 128 + '\n')
text = f'{time_str}\n' + \
'tabulate format\n' + \
'^' * 128 + '\n' + \
tabulate.tabulate(table, headers='firstrow') + '\n' + \
'$' * 128 + '\n\n' + \
'-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \
'csv format\n' + \
'^' * 128 + '\n' + \
'\n'.join([','.join(row) for row in table]) + '\n' + \
'$' * 128 + '\n\n' + \
'-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \
'raw format\n' + \
'^' * 128 + '\n' + \
raw_txts + '\n' + \
'$' * 128 + '\n'
f.write(text)
self.logger.info(f'write summary to {osp.abspath(output_path)}')
with open(output_csv_path, 'w', encoding='utf-8') as f:
f.write('\n'.join([','.join(row) for row in table]) + '\n')
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
def summarize(
self,
output_path: str = None,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa
# pick up results
raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results()
# calculate group metrics
raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode)
# format table
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode)
# format raw txt
raw_txts = self._format_raw_txt(raw_results)
# output to screen
print(tabulate.tabulate(table, headers='firstrow'))
# output to .text / .csv files
self._output_to_file(output_path, time_str, table, raw_txts)
if self.lark_reporter:
content = f'{getpass.getuser()}'
content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'
self.lark_reporter.post(content)
with open(output_csv_path, 'w', encoding='utf-8') as f:
f.write('\n'.join([','.join(row) for row in table]) + '\n')
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')

@@ -172,6 +172,7 @@ class OpenICLEvalTask(BaseTask):
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
preds = {
k: preds[k]
for k in signature(icl_evaluator.score).parameters