Mirror of https://github.com/open-compass/opencompass.git
[Feature] Add circular eval (#610)
* refactor default, add circular summarizer
* add circular
* update impl
* update doc
* minor update
* no more to be added
This commit is contained in:
parent 5202456b4c
commit d949e3c003
configs/eval_circular.py (new file, 91 lines)
@@ -0,0 +1,91 @@
from mmengine.config import read_base

from opencompass.datasets.circular import (CircularCEvalDataset, CircularMMLUDataset, CircularCMMLUDataset, CircularCSQADataset,
                                            CircularARCDataset, CircularHSWAGDataset, CircularOBQADataset, CircularRaceDataset, CircularEvaluator)
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from .datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from .datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from .datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import commonsenseqa_datasets
    from .datasets.obqa.obqa_gen_9069e4 import obqa_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets

    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b_model
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b_model
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_model
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat_model

    from .summarizers.groups.mmlu import mmlu_summary_groups
    from .summarizers.groups.cmmlu import cmmlu_summary_groups
    from .summarizers.groups.ceval import ceval_summary_groups

for ds, t in [
    (ceval_datasets, CircularCEvalDataset),
    (mmlu_datasets, CircularMMLUDataset),
    (cmmlu_datasets, CircularCMMLUDataset),
    (hellaswag_datasets, CircularHSWAGDataset),
    (ARC_e_datasets, CircularARCDataset),
    (ARC_c_datasets, CircularARCDataset),
    (commonsenseqa_datasets, CircularCSQADataset),
    (obqa_datasets, CircularOBQADataset),
    (race_datasets, CircularRaceDataset),
]:
    for d in ds:
        d['type'] = t
        d['abbr'] = d['abbr'] + '-circular-4'
        d['eval_cfg']['evaluator'] = {'type': CircularEvaluator, 'circular_pattern': 'circular'}
        d['circular_patterns'] = 'circular'


datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
models = sum([v for k, v in locals().items() if k.endswith("_model")], [])

# config summarizer
other_summary_groups = [
    {'name': 'average',
     'subsets': ['ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c', 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high']},
]
origin_summary_groups = sum([v for k, v in locals().items() if k.endswith("_summary_groups")], [])
new_summary_groups = []
for item in origin_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )
summarizer = dict(
    type=CircularSummarizer,
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'average-circular-4',
        'ceval-circular-4',
        'mmlu-circular-4',
        'cmmlu-circular-4',
        'hellaswag-circular-4',
        'ARC-e-circular-4',
        'ARC-c-circular-4',
        'commonsense_qa-circular-4',
        'openbookqa_fact-circular-4',
        'race-middle-circular-4',
        'race-high-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
        'mmlu-humanities-circular-4',
        'mmlu-stem-circular-4',
        'mmlu-social-science-circular-4',
        'mmlu-other-circular-4',
        'cmmlu-humanities-circular-4',
        'cmmlu-stem-circular-4',
        'cmmlu-social-science-circular-4',
        'cmmlu-other-circular-4',
        'cmmlu-china-specific-circular-4',
    ],
    summary_groups=new_summary_groups,
)
docs/en/advanced_guides/circular_eval.md (new file, 113 lines)
@@ -0,0 +1,113 @@
# CircularEval

## Background

For multiple-choice questions, an LLM choosing the correct option does not necessarily mean it truly understood and reasoned about the question; it may simply have guessed. To separate these two cases, and to reduce the LLM's bias towards particular option positions, CircularEval can be used: each question is augmented by shuffling its options, and it is considered correct under CircularEval only if the LLM answers all variants of the augmented question correctly.

## Adding Your Own CircularEval Dataset

In general, to evaluate a dataset with CircularEval, both its loading and its evaluation methods need to be overridden, which requires modifications in the OpenCompass main library as well as in the configuration files. We use C-Eval as the example below.

OpenCompass main library:

```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta


class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The overloaded dataset class
    dataset_class = CEvalDataset

    # Splits of the DatasetDict that need CircularEval. CEvalDataset loads
    # [dev, val, test]; only 'val' and 'test' need CircularEval, 'dev' does not
    default_circular_splits = ['val', 'test']

    # List of keys to be shuffled
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of 'answer_key' is one of ['A', 'B', 'C', 'D'] and represents
    # the correct answer. This field indicates how to update the correct answer after the
    # options are shuffled. Choose either this or default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the 'answer_key' content is not one of ['A', 'B', 'C', 'D'], a function can be used
    # to specify the correct answer after the options are shuffled. Choose either this or
    # default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # 'item' is the original data item
    #     # 'circular_pattern' is a tuple indicating the order after shuffling options,
    #     # e.g., ('D', 'A', 'B', 'C') means the original option A is now D, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```
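
As a quick illustration (a standalone sketch, not part of the library), this is what the commented-out switch method computes when the answer is stored as an option letter:

```python
# circular_pattern ('D', 'A', 'B', 'C') means the text originally shown under option A
# is now shown under D, the original B under A, C under B, and D under C.
item = {'answer': 'B'}
circular_pattern = ('D', 'A', 'B', 'C')
item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
assert item['answer'] == 'A'  # the content of the old option B is now labelled A
```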

`CircularCEvalDataset` accepts a `circular_patterns` argument in its `load` method, with two possible values:

- `circular`: a single cycle (the default). ABCD is expanded to ABCD, BCDA, CDAB, DABC, for a total of 4 variants.
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., for a total of 24 variants.
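
For reference, both expansions can be reproduced with the same logic used by `get_circular_patterns` and `get_all_possible_patterns` in `opencompass/datasets/circular.py`:

```python
import itertools

option_keys = ['A', 'B', 'C', 'D']

# 'circular': the 4 rotations of the original order
doubled = option_keys + option_keys
circular = [tuple(doubled[i:i + len(option_keys)]) for i in range(len(option_keys))]
# [('A', 'B', 'C', 'D'), ('B', 'C', 'D', 'A'), ('C', 'D', 'A', 'B'), ('D', 'A', 'B', 'C')]

# 'all_possible': all 24 permutations
all_possible = list(itertools.permutations(option_keys))
```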

Additionally, we provide a `CircularEvaluator` to replace `AccEvaluator`. The evaluator also accepts a `circular_pattern` argument, which should be consistent with the pattern used by the dataset. It produces the following metrics:

- `acc_{origin|circular|all_possible}`: treats each shuffled variant as a separate question and computes accuracy over all variants.
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over the original questions.
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over the original questions.
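
A minimal sketch of how these metrics relate (toy numbers, not library code): group per-variant correctness by the original question, then aggregate.

```python
from collections import defaultdict

# (qid, is_correct) for 2 original questions, 4 circular variants each
records = [(0, True), (0, True), (0, True), (0, False),
           (1, True), (1, True), (1, True), (1, True)]

by_qid = defaultdict(list)
for qid, ok in records:
    by_qid[qid].append(ok)

n_variants = sum(len(v) for v in by_qid.values())
acc_circular = 100 * sum(sum(v) for v in by_qid.values()) / n_variants            # 87.5
perf_circular = 100 * sum(all(v) for v in by_qid.values()) / len(by_qid)          # 50.0
more_3_circular = 100 * sum(sum(v) >= 3 for v in by_qid.values()) / len(by_qid)   # 100.0
```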

OpenCompass configuration file:

```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Overloading the load method
    d['type'] = CircularCEvalDataset
    # Renaming for differentiation from non-circular evaluation versions
    d['abbr'] = d['abbr'] + '-circular-4'
    # Overloading the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# The dataset after the above operations looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # Unchanged
#     name='computer_network',  # Unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # Unchanged
#     infer_cfg=dict(...),  # Unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```

Additionally, for a better presentation of CircularEval results, consider using the following summarizer:

```python
from mmengine.config import read_base

from opencompass.summarizers import CircularSummarizer

with read_base():
    from ...summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select specific metrics to view
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```

For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
@@ -67,6 +67,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
   advanced_guides/prompt_attack.md
   advanced_guides/longeval.md
   advanced_guides/subjective_evaluation.md
   advanced_guides/circular_eval.md

.. _Tools:
.. toctree::
docs/zh_cn/advanced_guides/circular_eval.md (new file, 111 lines)
@@ -0,0 +1,111 @@
# CircularEval

## Background

For a multiple-choice question, an LLM giving the correct option does not necessarily mean it truly understood the question and reasoned its way to the answer; it may simply have guessed. To distinguish these two situations, and to reduce the LLM's bias towards option positions, we can try CircularEval: each multiple-choice question is augmented by shuffling its options, and only if the LLM answers every augmented variant correctly do we count the question as correct under CircularEval.

## Adding Your Own CircularEval Dataset

In general, to evaluate a dataset with CircularEval, both its loading and its evaluation logic need to be overridden, so changes are required in the OpenCompass main library as well as in the configuration files. We use C-Eval as the example below.

OpenCompass main library:

```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta


class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The dataset class being overloaded
    dataset_class = CEvalDataset

    # If the original load method returns a DatasetDict, which splits need CircularEval.
    # CEvalDataset loads [dev, val, test]; only 'val' and 'test' need CircularEval, 'dev' does not
    default_circular_splits = ['val', 'test']

    # The list of keys to be shuffled
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of answer_key is one of ['A', 'B', 'C', 'D'] and represents the
    # correct answer. This field indicates how the correct answer is updated after the options
    # are shuffled. Choose either this or default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the content of answer_key is not one of ['A', 'B', 'C', 'D'], a function can be used
    # to specify the correct answer after the options are shuffled. Choose either this or
    # default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # item is the original data item
    #     # circular_pattern is a tuple giving the option order after shuffling, e.g.
    #     # ('D', 'A', 'B', 'C') means the original option A becomes D, the original B becomes A, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```
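
A hedged usage sketch: `CircularDatasetMeta` rewrites `load` as `load(cls, circular_patterns='circular', *args, **kwargs)` and forwards the remaining arguments to `CEvalDataset.load`. The keyword arguments below are assumptions taken from the config example later on this page; adjust them to your own data layout.

```python
dataset = CircularCEvalDataset.load(
    circular_patterns='circular',
    path='./data/ceval/formal_ceval',
    name='computer_network',
)
# The 'val' and 'test' splits now contain 4 entries per original question,
# each carrying extra 'qid' and 'circular_pattern' fields.
```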

`CircularCEvalDataset` accepts a `circular_patterns` argument in its `load` method, which takes one of two values:

- `circular`: a single cycle (the default). ABCD is expanded to ABCD, BCDA, CDAB, DABC, 4 variants in total.
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total.

We also provide a `CircularEvaluator` to replace `AccEvaluator`. This evaluator likewise accepts a `circular_pattern` argument, which should be kept consistent with the one above. It produces the following metrics:

- `acc_{origin|circular|all_possible}`: treats each shuffled variant as a separate question and computes accuracy over all variants.
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over the original questions.
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over the original questions.

OpenCompass configuration file:

```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Overload the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish it from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Overload the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# After the above operations, each dataset looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```
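
Roughly what the augmentation produces for one original question under the `circular` pattern (a sketch: the extra `qid` and `circular_pattern` fields come from `CircularDatasetMeta.make_circular_items`; the other field values here are made up):

```python
origin_item = {'question': '...', 'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd', 'answer': 'B'}
augmented = [
    {'question': '...', 'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd', 'answer': 'B',
     'qid': 0, 'circular_pattern': ('A', 'B', 'C', 'D')},
    {'question': '...', 'A': 'd', 'B': 'a', 'C': 'b', 'D': 'c', 'answer': 'C',
     'qid': 0, 'circular_pattern': ('B', 'C', 'D', 'A')},
    # ... two more rotations; each original question becomes 4 evaluation items
]
```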

In addition, for a better presentation of CircularEval results, consider using the following summarizer:

```python
from mmengine.config import read_base

from opencompass.summarizers import CircularSummarizer

with read_base():
    from ...summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select the specific metrics to display
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```

For more complex evaluation cases, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
@@ -67,6 +67,7 @@ OpenCompass 上手路线
   advanced_guides/prompt_attack.md
   advanced_guides/longeval.md
   advanced_guides/subjective_evaluation.md
   advanced_guides/circular_eval.md

.. _工具:
.. toctree::
@@ -13,6 +13,7 @@ from .cb import * # noqa: F401, F403
from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .circular import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403
from .clozeTest_maxmin import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403
@@ -14,32 +14,19 @@ class ARCDataset(BaseDataset):
    def load(path: str):
        with open(path, 'r', errors='ignore') as in_f:
            rows = []
            for i, line in enumerate(in_f):
                sample = json.loads(line.strip())
                answerKey = sample['answerKey']
                sample = sample['question']
                question = sample['stem']
                choices = sample['choices']
                if len(choices) != 4:
            for line in in_f:
                item = json.loads(line.strip())
                question = item['question']
                if len(question['choices']) != 4:
                    continue
                textA = choices[0]['text']
                textB = choices[1]['text']
                textC = choices[2]['text']
                textD = choices[3]['text']
                labels = [c['label'] for c in question['choices']]
                answerKey = 'ABCD'[labels.index(item['answerKey'])]
                rows.append({
                    'question': question,
                    'question': question['stem'],
                    'answerKey': answerKey,
                    'textA': textA,
                    'textB': textB,
                    'textC': textC,
                    'textD': textD
                    'textA': question['choices'][0]['text'],
                    'textB': question['choices'][1]['text'],
                    'textC': question['choices'][2]['text'],
                    'textD': question['choices'][3]['text'],
                })
            dataset = Dataset.from_dict({
                'question': [row['question'] for row in rows],
                'answerKey': [row['answerKey'] for row in rows],
                'textA': [row['textA'] for row in rows],
                'textB': [row['textB'] for row in rows],
                'textC': [row['textC'] for row in rows],
                'textD': [row['textD'] for row in rows]
            })
            return dataset
        return Dataset.from_list(rows)
opencompass/datasets/circular.py (new file, 348 lines)
@@ -0,0 +1,348 @@
import copy
import itertools
from typing import Callable, List, Optional, Union

from datasets import Dataset, DatasetDict

from opencompass.openicl.icl_evaluator import BaseEvaluator

from .arc import ARCDataset
from .ceval import CEvalDataset
from .cmmlu import CMMLUDataset
from .commonsenseqa import commonsenseqaDataset
from .hellaswag import hellaswagDataset_V2
from .mmlu import MMLUDataset
from .obqa import OBQADataset
from .race import RaceDataset
from .xiezhi import XiezhiDataset


def get_origin_patterns(option_keys):
    return [tuple(option_keys)]


def get_circular_patterns(option_keys):
    double_option_keys = option_keys + option_keys
    circular_patterns = [
        tuple(double_option_keys[i:i + len(option_keys)])
        for i in range(len(option_keys))
    ]
    return circular_patterns


def get_all_possible_patterns(option_keys):
    circular_patterns = list(itertools.permutations(option_keys))
    return circular_patterns

class CircularDatasetMeta(type):
    """This Meta Class is designed to transform a class that reads datasets
    into one that supports reading datasets required for CircularEval. It
    overloads an existing load method for the original class.

    The Meta Class should possess the following attributes:

    - `dataset_class` (class): The class for reading datasets, such as
      `CEvalDataset`.
    - `default_circular_splits` (list, optional): The default splits of the
      dataset that need to undergo CircularEval, like ['val', 'test']. If a
      `Dataset` is loaded originally, this field will be ignored.
    - `default_option_keys` (list): The keys for options in the dataset, such
      as ['A', 'B', 'C', 'D'].
    - `default_answer_key` (str, optional): The key for answers in the dataset,
      like 'answer'. This is an alternative to
      `default_answer_key_switch_method`.
    - `default_answer_key_switch_method` (function, optional): The method to
      transform the key for answers in the dataset. This is an alternative to
      `default_answer_key`.
    """

    @staticmethod
    def make_circular_items(
        origin_item,
        circular_patterns,
        option_keys,
        answer_key,
        answer_key_switch_method,
        qid,
    ):
        items = []
        for circular_pattern in circular_patterns:
            item = copy.deepcopy(origin_item)
            for i in range(len(option_keys)):
                item[circular_pattern[i]] = origin_item[option_keys[i]]
            if answer_key_switch_method is None:
                if origin_item[answer_key] in option_keys:
                    item[answer_key] = circular_pattern[option_keys.index(
                        origin_item[answer_key])]
                else:
                    pass
            else:
                item = answer_key_switch_method(item, circular_pattern)
            item['qid'] = qid
            item['circular_pattern'] = tuple(circular_pattern)
            items.append(item)
        return items

    @staticmethod
    def make_circular_dataset(dataset, circular_patterns, option_keys,
                              answer_key, answer_key_switch_method):
        circulated_items = []
        for i, item in enumerate(dataset):
            item = CircularDatasetMeta.make_circular_items(
                item,
                circular_patterns,
                option_keys,
                answer_key,
                answer_key_switch_method,
                i,
            )
            circulated_items.extend(item)
        return Dataset.from_list(circulated_items)

    def make_circular(
        dataset: Union[Dataset, DatasetDict],
        circular_splits: Optional[List[str]] = ['test'],
        circular_patterns: str = 'circular',
        option_keys: List[str] = ['A', 'B', 'C', 'D'],
        answer_key: Optional[str] = 'answer',
        answer_key_switch_method: Optional[Callable] = None,
    ):
        """Transform the dataset into one that is compatible with CircularEval.
        In CircularEval, the original multiple-choice questions with options
        ABCD are augmented by shuffling the order of options, such as BCDA,
        CDAB, DABC, etc. A model is considered correct only if it answers all
        augmented questions correctly. This method effectively prevents models
        from memorizing answers.

        Args:
            datasets: The dataset to be augmented.
            circular_splits: List of splits to make circular. This is only
                effective when the dataset is a DatasetDict.
            circular_patterns: Method for circular processing, can be 'circular'
                for single cycle or 'all_possible' for all permutations, default
                is 'circular'.
            option_keys: List of keys for options, default to ['A', 'B', 'C', 'D'].
            answer_key: Key for the answer, default to 'answer'. When specified,
                ensure that the content of answer_key is among the option_keys.
                It is an alternative to specifying answer_key_switch_method.
            answer_key_switch_method: Function to modify the answer_key. It is an
                alternative to specifying answer_key.
        """

        if isinstance(circular_patterns, str):
            if circular_patterns == 'circular':
                circular_patterns = get_circular_patterns(option_keys)
            elif circular_patterns == 'all_possible':
                circular_patterns = get_all_possible_patterns(option_keys)
            else:
                raise ValueError(
                    f'Unknown circular_patterns: {circular_patterns}')
        else:
            assert isinstance(circular_patterns, list)
            assert all([isinstance(i, list) for i in circular_patterns])
            # TODO: other necessary sanity checks
            raise NotImplementedError(
                'circular_patterns int list of list has not been tested yet')

        if answer_key is None and answer_key_switch_method is None:
            raise ValueError(
                'answer_key and answer_key_switch_method cannot be both None')
        if answer_key is not None and answer_key_switch_method is not None:
            raise ValueError(
                'either answer_key or answer_key_switch_method should be None')

        if isinstance(dataset, Dataset):
            dataset = CircularDatasetMeta.make_circular_dataset(
                dataset,
                circular_patterns,
                option_keys,
                answer_key,
                answer_key_switch_method,
            )
        else:
            assert isinstance(dataset, DatasetDict)
            dataset_dict = {}
            for split in dataset:
                if circular_splits is not None and split in circular_splits:
                    dataset_dict[
                        split] = CircularDatasetMeta.make_circular_dataset(
                            dataset[split],
                            circular_patterns,
                            option_keys,
                            answer_key,
                            answer_key_switch_method,
                        )
                else:
                    dataset_dict[split] = dataset[split]
            dataset = DatasetDict(dataset_dict)
        return dataset

    def __new__(cls, name, bases, dct):
        new_cls = super().__new__(cls, name, bases, dct)

        def load(cls, circular_patterns='circular', *args, **kwargs):
            circular_splits = getattr(cls, 'default_circular_splits', None)
            option_keys = cls.default_option_keys
            answer_key = getattr(cls, 'default_answer_key', None)
            answer_key_switch_method = getattr(
                cls, 'default_answer_key_switch_method', None)
            dataset = cls.dataset_class.load(*args, **kwargs)
            return CircularDatasetMeta.make_circular(
                dataset,
                circular_splits,
                circular_patterns,
                option_keys,
                answer_key,
                answer_key_switch_method,
            )

        setattr(new_cls, 'load', classmethod(load))
        return new_cls

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    dataset_class = CEvalDataset
    default_circular_splits = ['val', 'test']
    default_option_keys = ['A', 'B', 'C', 'D']
    default_answer_key = 'answer'


class CircularMMLUDataset(MMLUDataset, metaclass=CircularDatasetMeta):
    dataset_class = MMLUDataset
    default_circular_splits = ['test']
    default_option_keys = ['A', 'B', 'C', 'D']
    default_answer_key = 'target'


class CircularCMMLUDataset(CMMLUDataset, metaclass=CircularDatasetMeta):
    dataset_class = CMMLUDataset
    default_circular_splits = ['test']
    default_option_keys = ['A', 'B', 'C', 'D']
    default_answer_key = 'answer'


class CircularCSQADataset(commonsenseqaDataset, metaclass=CircularDatasetMeta):
    dataset_class = commonsenseqaDataset
    default_circular_splits = ['validation']
    default_option_keys = ['A', 'B', 'C', 'D', 'E']
    default_answer_key = 'answerKey'


class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta):
    dataset_class = ARCDataset
    default_circular_splits = None
    default_option_keys = ['textA', 'textB', 'textC', 'textD']

    def default_answer_key_switch_method(item, circular_pattern):
        circular_pattern = tuple(i[-1] for i in circular_pattern)
        item['answerKey'] = circular_pattern['ABCD'.index(item['answerKey'])]
        return item


class CircularHSWAGDataset(hellaswagDataset_V2, metaclass=CircularDatasetMeta):
    dataset_class = hellaswagDataset_V2
    default_circular_splits = None
    default_option_keys = ['A', 'B', 'C', 'D']
    default_answer_key = 'label'


class CircularOBQADataset(OBQADataset, metaclass=CircularDatasetMeta):
    dataset_class = OBQADataset
    default_circular_splits = None
    default_option_keys = ['A', 'B', 'C', 'D']
    default_answer_key = 'answerKey'


class CircularRaceDataset(RaceDataset, metaclass=CircularDatasetMeta):
    dataset_class = RaceDataset
    default_circular_splits = ['test']
    default_option_keys = ['A', 'B', 'C', 'D']
    default_answer_key = 'answer'


class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta):
    dataset_class = XiezhiDataset
    default_circular_splits = None
    default_option_keys = ['A', 'B', 'C', 'D']
    default_answer_key = 'answer'

class CircularEvaluator(BaseEvaluator):
    """This Evaluator assesses datasets post-Circular processing, generating
    the following evaluation metrics:

    - `acc_{origin|circular|all_possible}`: Treats each question with shuffled
      answer options as separate, calculating accuracy.
    - `perf_{origin|circular|all_possible}`: According to Circular logic, a
      question is considered correct only if all its variations with shuffled
      options are answered correctly, calculating accuracy. perf is short for
      perfect.
    - `more_{num}_{origin|circular|all_possible}`: According to Circular logic,
      a question is considered correct only if the number of its variations
      answered correctly is greater than or equal to `num`, calculating
      accuracy.

    Note that when the `all_possible` method is used to shuffle option order,
    it naturally includes the Circular method, and its metrics will also be
    output.

    Args:
        circular_pattern: The method of shuffling options, either 'circular' or
            'all_possible', defaulting to 'circular'.
    """

    def __init__(self, circular_pattern='circular'):
        super().__init__()
        self.circular_pattern = circular_pattern

    def score(self, predictions, references, test_set):
        circular_patterns = {}
        circular_patterns['origin'] = get_origin_patterns(
            test_set[0]['circular_pattern'])
        circular_patterns['circular'] = get_circular_patterns(
            test_set[0]['circular_pattern'])
        if self.circular_pattern == 'all_possible':
            circular_patterns['all_possible'] = get_all_possible_patterns(
                test_set[0]['circular_pattern'])

        metrics = {}
        tmp_metrics = {}
        tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
        tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
        # calculate the original accuracy
        for pred, ref, origin_item in zip(predictions, references, test_set):
            circular_pattern = origin_item['circular_pattern']
            for k in circular_patterns:
                if tuple(circular_pattern) in circular_patterns[k]:
                    tmp_metrics[f'correct_{k}'] += 1 if pred == ref else 0
                    tmp_metrics[f'count_{k}'] += 1

        for k in circular_patterns:
            metrics[f'acc_{k}'] = (tmp_metrics[f'correct_{k}'] /
                                   tmp_metrics[f'count_{k}'] * 100)

        # calculate the circular accuracy
        _details = {k: {} for k in circular_patterns}
        for pred, ref, origin_item in zip(predictions, references, test_set):
            index = origin_item['qid']
            circular_pattern = origin_item['circular_pattern']
            for k in circular_patterns:
                if tuple(circular_pattern) in circular_patterns[k]:
                    _details[k].setdefault(
                        index, []).append(True if pred == ref else False)
        for k in _details:
            _details[k] = {
                index: sum(_details[k][index])
                for index in _details[k]
            }
        for k in _details:
            for j in range(1, len(circular_patterns[k]) + 1):
                count = sum([_details[k][index] >= j for index in _details[k]])
                total = len(_details[k])
                if j != len(circular_patterns[k]):
                    metrics[f'more_{j}_{k}'] = count / total * 100
                else:
                    metrics[f'perf_{k}'] = count / total * 100

        return metrics
@@ -1,4 +1,5 @@
from .circular import CircularSummarizer
from .default import DefaultSummarizer
from .subjective import SubjectiveSummarizer

__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer']
__all__ = ['DefaultSummarizer', 'SubjectiveSummarizer', 'CircularSummarizer']
opencompass/summarizers/circular.py (new file, 57 lines)
@@ -0,0 +1,57 @@
from typing import List, Optional

from mmengine import ConfigDict

from opencompass.utils import dataset_abbr_from_cfg
from opencompass.utils.prompt import get_prompt_hash

from .default import DefaultSummarizer


class CircularSummarizer(DefaultSummarizer):

    def __init__(self,
                 config: ConfigDict,
                 dataset_abbrs: Optional[List[str]] = None,
                 summary_groups: List = [],
                 prompt_db=None,
                 metric_types=None) -> None:
        super().__init__(config, dataset_abbrs, summary_groups, prompt_db)
        self.metric_types = metric_types

    def _format_table(self, parsed_results, dataset_metrics,
                      dataset_eval_mode):
        prompt_version = {
            dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6]
            for d in self.dataset_cfgs
        }

        table = []
        header1 = ['dataset', 'version', 'mode'] + sum(
            [[model_abbr] + ['-' for _ in range(len(self.metric_types) - 1)]
             for model_abbr in self.model_abbrs], [])
        table.append(header1)
        header2 = ['-', '-', '-'] + sum(
            [self.metric_types for _ in self.model_abbrs], [])
        table.append(header2)
        for dataset_abbr in self.dataset_abbrs:
            if dataset_abbr not in dataset_metrics:
                table.append([dataset_abbr, '-', '-'] + ['-'] *
                             len(self.model_abbrs) * len(self.metric_types))
                continue
            row = [
                dataset_abbr,
                prompt_version.get(dataset_abbr, '-'),
                dataset_eval_mode.get(dataset_abbr, '-')
            ]
            for model_abbr in self.model_abbrs:
                for metric in self.metric_types:
                    if dataset_abbr in parsed_results[
                            model_abbr] and metric in parsed_results[
                                model_abbr][dataset_abbr]:
                        row.append('{:.02f}'.format(
                            parsed_results[model_abbr][dataset_abbr][metric]))
                    else:
                        row.append('-')
            table.append(row)
        return table
@@ -1,10 +1,11 @@
# flake8: noqa
# yapf: disable
import functools
import getpass
import math
import os.path as osp
from datetime import datetime
from typing import List, Optional
from typing import Any, Dict, List, Optional

import mmengine
import tabulate
@@ -22,12 +23,9 @@ class DefaultSummarizer:
    """Default summarizer in OpenCompass.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
        dataset_abbrs (list[str], optional): Dataset abbreviations to be
            listed in the summary.
        summary_groups (list): The dataset groups whose results need to be
            averaged out. For example, mmlu. Each item it a dict with
        config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime.
        dataset_abbrs (list[str], optional): Dataset abbreviations to be listed in the summary.
        summary_groups (list): The dataset groups whose results need to be averaged out. For example, mmlu. Each item it a dict with
            'name' (str) and 'subsets' (list of dataset abbrs), and optionally
            'weights' if weighted average is needed.
        prompt_db: A deprecated field.
@@ -48,28 +46,37 @@ class DefaultSummarizer:
        if self.cfg.get('lark_bot_url', None):
            self.lark_reporter = LarkReporter(self.cfg['lark_bot_url'])

    def summarize(
            self,
            output_path: str = None,
            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa
        self.model_cfgs = self.cfg['models']
        self.dataset_cfgs = self.cfg['datasets']
        self.work_dir = self.cfg['work_dir']
        self.model_abbrs = [model_abbr_from_cfg(model) for model in self.model_cfgs]

        model_cfgs = self.cfg['models']
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']
    def _pick_up_results(self):
        """The function reads the numerical results of evaluations from the
        output folder based on the configuration file, and ultimately returns
        four dictionaries, each containing processed information in different
        formats. The contents of the four dictionaries are as follows:

        # pick up results
        raw_results = {}
        parsed_results = {}
        dataset_metrics = {}
        - raw_results: contains the raw results of each model on each dataset (excluding details).
        - parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored.
        - dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. The list is ordered according to the METRIC_WHITELIST,
            with metrics appearing earlier considered more important.
        - dataset_eval_mode: contains the evaluation mode for each dataset.
        """
        # raw_results: {model_abbr: {dataset_abbr: result}}
        raw_results : Dict[str, Dict[str, Any]] = {}
        # parsed_results: {model_abbr: {dataset_abbr: {metric: score}}}
        parsed_results : Dict[str, Dict[str, Dict[str, float]]] = {}
        # dataset_metrics: {dataset_abbr: [metric]}
        dataset_metrics : Dict[str, List[str]] = {}

        model_abbrs = [model_abbr_from_cfg(model) for model in model_cfgs]
        for model in model_cfgs:
        for model in self.model_cfgs:
            model_abbr = model_abbr_from_cfg(model)
            parsed_results[model_abbr] = {}
            raw_results[model_abbr] = {}
            for dataset in dataset_cfgs:
            for dataset in self.dataset_cfgs:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
                filepath = get_infer_output_path(model, dataset, osp.join(work_dir, 'results'))
                filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results'))
                if not osp.exists(filepath):
                    continue
                result = mmengine.load(filepath)
@@ -78,34 +85,28 @@ class DefaultSummarizer:
                if 'error' in result:
                    self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}')
                    continue
                else:
                    parsed_results[model_abbr][dataset_abbr] = []
                    dataset_metrics[dataset_abbr] = []
                    for metric, score in result.items():
                        if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)):
                            parsed_results[model_abbr][dataset_abbr].append(score)
                            dataset_metrics[dataset_abbr].append(metric)
                        else:
                            continue
                    if len(parsed_results[model_abbr][dataset_abbr]) == 0:
                        self.logger.warning(f'unknown result format: {result}, continue')
                        del parsed_results[model_abbr][dataset_abbr]
                        del dataset_metrics[dataset_abbr]
                _rst, _dm = {}, []
                for metric, score in result.items():
                    if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)):
                        _rst[metric] = score
                        _dm.append(metric)
                    else:
                        continue
                    indice = sorted(
                        list(range(len(dataset_metrics[dataset_abbr]))),
                        key=lambda i: (
                            METRIC_WHITELIST.index(dataset_metrics[dataset_abbr][i])
                            if dataset_metrics[dataset_abbr][i] in METRIC_WHITELIST
                            else len(METRIC_WHITELIST)
                        )
                    )
                    parsed_results[model_abbr][dataset_abbr] = [parsed_results[model_abbr][dataset_abbr][i] for i in indice]
                    dataset_metrics[dataset_abbr] = [dataset_metrics[dataset_abbr][i] for i in indice]
                if len(_rst) == 0:
                    self.logger.warning(f'unknown result format: {result}, continue')
                    continue
                _dm = sorted(_dm, key=lambda i: METRIC_WHITELIST.index(i) if i in METRIC_WHITELIST else len(METRIC_WHITELIST))

        # parse eval mode
        dataset_eval_mode = {}
        for dataset in dataset_cfgs:
                if dataset_abbr in dataset_metrics:
                    assert tuple(dataset_metrics[dataset_abbr]) == tuple(_dm), \
                        f'{dataset_abbr} has different metrics: {dataset_metrics[dataset_abbr]} vs {_dm}'
                else:
                    dataset_metrics[dataset_abbr] = _dm
                parsed_results[model_abbr][dataset_abbr] = _rst

        # dataset_eval_mode: {dataset_abbr: eval_mode}
        dataset_eval_mode : Dict[str, str] = {}
        for dataset in self.dataset_cfgs:
            inferencer = dataset.get('infer_cfg', {}).get('inferencer', {}).get('type', '')
            inferencer = inferencer if isinstance(inferencer, str) else inferencer.__name__
            dataset_abbr = dataset_abbr_from_cfg(dataset)
@@ -116,64 +117,97 @@ class DefaultSummarizer:
            else:
                dataset_eval_mode[dataset_abbr] = 'unknown'
                self.logger.warning(f'unknown inferencer: {inferencer} - {dataset_abbr}')
        return raw_results, parsed_results, dataset_metrics, dataset_eval_mode

        # calculate group metrics
    def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics, dataset_eval_mode):
        """The function calculates the numerical results for each group based
        on the configuration in summary_groups, and updates the contents of
        each dictionary accordingly."""
        summary_groups = self.summary_groups
        for sg in summary_groups:
            for model_abbr in model_abbrs:
                results = {}
                eval_modes = []
                for dataset_abbr in sg['subsets']:
                    if dataset_abbr in parsed_results[model_abbr]:
                        results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0]
                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                if len(results) == len(sg['subsets']):
                    if 'std' in sg and sg['std'] == True:
                        avg = sum(results[k] for k in results) / len(results)
                        variance = sum((results[k] - avg)**2 for k in results) / len(results)
                        metric = 'standard_deviation'
                        results[metric] = math.sqrt(variance)
                    else:
                        if 'weights' in sg:
                            numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
                            denominator = sum(sg['weights'].values())
                            metric = 'weighted_average'
                        else:
                            numerator = sum(results[k] for k in results)
                            denominator = len(results)
                            metric = 'naive_average'
                        results[metric] = numerator / denominator
            for model_abbr in self.model_abbrs:
                available_count = sum(dataset_abbr in parsed_results[model_abbr] for dataset_abbr in sg['subsets'])
                if available_count == 0:
                    continue
                if available_count != len(sg['subsets']):
                    raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(parsed_results[model_abbr].keys()))}
                    continue

                if sg.get('std', False):
                    default_metric = 'standard_deviation'
                elif sg.get('weights', []):
                    default_metric = 'weighted_average'
                else:
                    default_metric = 'naive_average'
                scores, eval_modes, group_metrics = {}, [], None
                if any(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']) and \
                        any(isinstance(dataset_abbr, str) for dataset_abbr in sg['subsets']):
                    raise NotImplementedError('mixed dataset_abbr type is not supported')

                if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']):
                    group_metrics = [default_metric]
                    for dataset_abbr, metric in sg['subsets']:
                        scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                else:
                    group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
                    if len(group_metrics) > 1:
                        for metric in group_metrics:
                            for dataset_abbr in sg['subsets']:
                                scores.setdefault(metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
                            eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
                    else:
                        group_metrics = [default_metric]
                        for dataset_abbr in sg['subsets']:
                            metric = dataset_metrics[dataset_abbr][0]
                            scores.setdefault(default_metric, []).append(parsed_results[model_abbr][dataset_abbr][metric])
                            eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))

                result = {}
                for metric in scores:
                    if default_metric == 'standard_deviation':
                        avg = sum(scores[metric]) / len(scores[metric])
                        variance = sum((k - avg) ** 2 for k in scores[metric]) / len(scores[metric])
                        scores[metric] = result[metric] = math.sqrt(variance)
                    else:
                        if default_metric == 'weighted_average':
                            numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'])
                            denominator = sum(sg['weights'].values())
                        else:
                            numerator = sum(scores[metric])
                            denominator = len(scores[metric])
                        scores[metric] = result[metric] = numerator / denominator
                eval_modes = list(set(eval_modes))
                eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'
                    # add to global results
                    raw_results[model_abbr][sg['name']] = results
                    parsed_results[model_abbr][sg['name']] = [results[metric]]

                    dataset_metrics[sg['name']] = [metric]
                    dataset_eval_mode[sg['name']] = eval_mode
                elif len(results) == 0:
                    continue
                else:
                    raw_results[model_abbr][sg['name']] = {'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(results.keys()))}
                # add to global results
                raw_results[model_abbr][sg['name']] = scores
                parsed_results[model_abbr][sg['name']] = result
                dataset_metrics[sg['name']] = group_metrics
                dataset_eval_mode[sg['name']] = eval_mode

        prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in dataset_cfgs}
        return raw_results, parsed_results, dataset_metrics, dataset_eval_mode

    def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode):
        dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs]
        prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs}

        # format table
        summarizer_dataset_abbrs = []
        if self.dataset_abbrs is None:
            for dataset in dataset_cfgs:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
            # display all dataset metrics included in the config
            for dataset_abbr in dataset_abbrs:
                if dataset_abbr in dataset_metrics:
                    for metric in dataset_metrics[dataset_abbr]:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
                else:
                    summarizer_dataset_abbrs.append((dataset_abbr, None))
            # along with all possible group metrics
            for dataset_abbr in dataset_metrics:
                for metric in dataset_metrics[dataset_abbr]:
                    if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
        else:
            # follow the required order
            for item in self.dataset_abbrs:
                if isinstance(item, str):
                    summarizer_dataset_abbrs.append((item, None))
@@ -181,79 +215,103 @@
                    summarizer_dataset_abbrs.append((item[0], item[1]))

        table = []
        header = ['dataset', 'version', 'metric', 'mode'] + model_abbrs
        header = ['dataset', 'version', 'metric', 'mode'] + self.model_abbrs
        table.append(header)
        for dataset_abbr, metric in summarizer_dataset_abbrs:
            if dataset_abbr not in dataset_metrics:
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
                continue
            if metric is None:
                index = 0
                metric = dataset_metrics[dataset_abbr][0]
            elif metric in dataset_metrics[dataset_abbr]:
                index = dataset_metrics[dataset_abbr].index(metric)
                pass
            else:
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
                continue

            row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]
            for model_abbr in model_abbrs:
            for model_abbr in self.model_abbrs:
                if dataset_abbr in parsed_results[model_abbr]:
                    row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][index]))
                    row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][metric]))
                else:
                    row.append('-')
            table.append(row)
        return table

        # format raw txt
    def _format_raw_txt(self, raw_results):
        raw_dataset_abbrs = []
        for model_abbr in model_abbrs:
        for model_abbr in self.model_abbrs:
            for dataset_abbr in raw_results[model_abbr]:
                if dataset_abbr not in raw_dataset_abbrs:
                    raw_dataset_abbrs.append(dataset_abbr)
        raw_txts = []
        for model_abbr in model_abbrs:
        for model_abbr in self.model_abbrs:
            raw_txts.append('-------------------------------')
            raw_txts.append(f'Model: {model_abbr}')
            for dataset_abbr in raw_dataset_abbrs:
                result = raw_results[model_abbr].get(dataset_abbr, '{}')
                raw_txts.append(f'{dataset_abbr}: {result}')
        raw_txts = '\n'.join(raw_txts)
        return raw_txts

        # output to screean
        print(tabulate.tabulate(table, headers='firstrow'))

    def _output_to_file(self, output_path, time_str, table, raw_txts):
        # output to file
        if output_path is None:
            output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
            output_csv_path = osp.join(work_dir, 'summary', f'summary_{time_str}.csv')
            output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt')
            output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv')
        else:
            output_csv_path = output_path.replace('.txt', '.csv')

        output_dir = osp.split(output_path)[0]
        mmengine.mkdir_or_exist(output_dir)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(time_str + '\n')
            f.write('tabulate format\n')
            f.write('^' * 128 + '\n')
            f.write(tabulate.tabulate(table, headers='firstrow') + '\n')
            f.write('$' * 128 + '\n')
            f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n')
            f.write('csv format\n')
            f.write('^' * 128 + '\n')
            f.write('\n'.join([','.join(row) for row in table]) + '\n')
            f.write('$' * 128 + '\n')
            f.write('\n' + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n')
            f.write('raw format\n')
            f.write('^' * 128 + '\n')
            f.write(raw_txts + '\n')
            f.write('$' * 128 + '\n')
            text = f'{time_str}\n' + \
                   'tabulate format\n' + \
                   '^' * 128 + '\n' + \
                   tabulate.tabulate(table, headers='firstrow') + '\n' + \
                   '$' * 128 + '\n\n' + \
                   '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \
                   'csv format\n' + \
                   '^' * 128 + '\n' + \
                   '\n'.join([','.join(row) for row in table]) + '\n' + \
                   '$' * 128 + '\n\n' + \
                   '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \
                   'raw format\n' + \
                   '^' * 128 + '\n' + \
                   raw_txts + '\n' + \
                   '$' * 128 + '\n'
            f.write(text)
        self.logger.info(f'write summary to {osp.abspath(output_path)}')

        with open(output_csv_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join([','.join(row) for row in table]) + '\n')
        self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')

    def summarize(
            self,
            output_path: str = None,
            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa

        # pick up results
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results()

        # calculate group metrics
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
            self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode)

        # format table
        table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode)

        # format raw txt
        raw_txts = self._format_raw_txt(raw_results)

        # output to screen
        print(tabulate.tabulate(table, headers='firstrow'))

        # output to .text / .csv files
        self._output_to_file(output_path, time_str, table, raw_txts)

        if self.lark_reporter:
            content = f'{getpass.getuser()} 的'
            content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'
            self.lark_reporter.post(content)

        with open(output_csv_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join([','.join(row) for row in table]) + '\n')
        self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
@@ -172,6 +172,7 @@ class OpenICLEvalTask(BaseTask):
        preds['predictions'] = pred_strs
        preds['references'] = (test_set[self.output_column]
                               if self.output_column else None)
        preds['test_set'] = test_set
        preds = {
            k: preds[k]
            for k in signature(icl_evaluator.score).parameters