[Feature] Add double-order subjective evaluation and remove duplicated responses between two models (#692)

* add features

* add doc string

* add doc string
bittersweet1999 2023-12-12 20:58:17 +08:00 committed by GitHub
parent 82a533a690
commit 6130394165
5 changed files with 76 additions and 11 deletions


@@ -36,7 +36,7 @@ for _name in subjective_all_sets:
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
random_order=True,
infer_order='random',
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[


@@ -34,7 +34,6 @@ for _name in subjective_all_sets:
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
random_order=True,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[


@@ -83,6 +83,10 @@ summarizer = dict(
)
```
In addition, you can also change the response order of the two models; please refer to `config/subjective_compare.py`.
When `infer_order` is set to `random`, the order of the two responses is shuffled randomly for each question.
When `infer_order` is set to `double`, each question is judged twice, once for each of the two possible response orders.
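For example, a minimal evaluator config might look like the sketch below (field values other than `infer_order` are illustrative placeholders; the import paths are the usual OpenCompass ones assumed here rather than taken from this diff):

```python
# Illustrative sketch only: a pairwise-comparison evaluator that judges each
# question in both response orders. The prompt round is a placeholder.
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate

subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        infer_order='double',  # or 'random' to shuffle the pair per question
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt='{question}'),  # placeholder judge prompt
            ]),
        ),
    ),
)
```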
### Single Model Scoring Configuration
For `config/subjective_score.py`, it is mainly same with `config/subjective_compare.py`, and you just need to modify the eval mode to `singlescore`.


@@ -83,6 +83,10 @@ summarizer = dict(
)
```
In addition, the dataset config also lets you choose the response order used when comparing two responses; please refer to `config/subjective_compare.py`.
When `infer_order` is set to `random`, the order of the two models' responses is shuffled randomly.
When `infer_order` is set to `double`, the two models' responses are judged in both orders.
### Single Response Scoring Configuration
For more detailed config settings for single response scoring, please refer to `config/subjective_score.py`. Most of that config is the same as the pairwise comparison config; you only need to change the evaluation mode by setting it to `singlescore`.


@@ -1,8 +1,10 @@
# flake8: noqa: E501
import os.path as osp
import random
from typing import Dict, List, Optional
import mmengine
from datasets import Dataset
from mmengine.config import ConfigDict
from opencompass.openicl.icl_inferencer import GenInferencer
@@ -14,20 +16,40 @@ from opencompass.utils.text_postprocessors import first_number_postprocess
from opencompass.utils.types import get_type_from_cfg
def randomize_preds_and_record_references(predictions,
references,
random_order,
seed=2680):
def order_preds_and_record_references(predictions,
references,
infer_order,
seed=2680):
"""Order predictions based on args and recording regrading references.
Args:
predictions (List): List of multi model predictions.
references (List): List of reference based on each problem.
infer_order (str, optional): The mode of inference order.
seed (int, optional): Random seed.
"""
random.seed(seed)
list_of_preds = [[] for _ in range(len(predictions))]
for i in range(len(predictions[0]['model_preds'])):
preds = [[pred['model_preds'][i], pred['model_name']]
for pred in predictions]
if random_order:
if infer_order == 'random':
random.shuffle(preds)
for j in range(len(preds)):
list_of_preds[j].append(preds[j][0])
references[i][f'answer{j+1}'] = preds[j][1]
if infer_order == 'double':
assert len(predictions) == 2
list_of_preds = [
a + b for a, b in zip(list_of_preds, reversed(list_of_preds))
]
reversed_references = []
for item in references:
reversed_item = item.copy()
reversed_item['answer1'], reversed_item['answer2'] = reversed_item[
'answer2'], reversed_item['answer1']
reversed_references.append(reversed_item)
references += reversed_references
return list_of_preds, references
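As a standalone illustration (toy data; assumes the helper can be imported from `lm_evaluator.py` as the hunk above suggests), `infer_order='double'` duplicates every question so it is judged once per response order:

```python
# Toy sketch of the 'double' ordering; the model names and answers are made up.
from opencompass.openicl.icl_evaluator.lm_evaluator import \
    order_preds_and_record_references

predictions = [
    dict(model_name='model_a', model_preds=['A1', 'A2']),
    dict(model_name='model_b', model_preds=['B1', 'B2']),
]
references = [{}, {}]

preds, refs = order_preds_and_record_references(predictions, references,
                                                infer_order='double')
# preds[0] == ['A1', 'A2', 'B1', 'B2'], preds[1] == ['B1', 'B2', 'A1', 'A2']
# refs has four entries: the original two plus copies with answer1/answer2 swapped.
```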
@@ -52,10 +74,11 @@ class LMEvaluator:
prompt_template: ConfigDict,
judge_cfg: ConfigDict,
output_path: str,
random_order: Optional[bool] = False,
infer_order: Optional[str] = 'random',
dataset_cfg: Optional[ConfigDict] = None,
postprocessor: ConfigDict = dict(type=first_number_postprocess)
) -> None:
assert infer_order in ['random', 'double']
self.output_path = output_path
out_dir, out_name = osp.split(output_path)
if not out_dir:
@@ -74,20 +97,36 @@ class LMEvaluator:
self.postprocessor = get_type_from_cfg(postprocessor)
self.logger = get_logger()
self.dataset_cfg = dataset_cfg
self.random_order = random_order
self.infer_order = infer_order
def score(self, predictions, references: Optional[List] = None) -> Dict:
if type(predictions) == list:
"""Apply to multi-model comparison."""
references = [{} for _ in range(len(predictions[0]['model_preds']))
] if references is None else references
predictions, references = randomize_preds_and_record_references(
predictions, references, self.random_order)
predictions, references = order_preds_and_record_references(
predictions, references, self.infer_order)
elif type(predictions) == dict:
"""Apply to single-model scoring."""
references = [{} for _ in range(len(predictions[0]['model_preds']))
] if references is None else references
predictions = [predictions['model_preds']]
# count duplicated predictions
total_predictions_num = len(predictions[0])
dup_indices = []
for i in range(len(predictions[0])):
check = [sub[i] for sub in predictions]
if len(set(check)) == 1:
dup_indices.append(i)
if len(dup_indices) != 0:
# remove duplicated predictions
for index in sorted(dup_indices, reverse=True):
for sublist in predictions:
del sublist[index]
del references[index]
pred_dict = {}
for i in range(len(predictions)):
key = 'prediction' if i == 0 else f'prediction{i + 1}'
@@ -95,6 +134,25 @@
if self.dataset_cfg:
dataset = build_dataset_from_cfg(self.dataset_cfg)
if self.infer_order == 'double':
new_ds = {
k: dataset.test[k] * 2
for k in dataset.test.column_names
}
dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
if len(dup_indices) != 0:
remaining_indices = [
idx for idx in range(len(dataset.test))
if idx not in dup_indices
]
dataset.reader.dataset['test'] = dataset.test.select(
remaining_indices)
print(
f'Among the {total_predictions_num} total predictions, {len(dup_indices)} were identical across models and have been removed!'
)
for k, v in pred_dict.items():
dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
dataset.reader.input_columns.append(k)
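
A minimal sketch of the dataset bookkeeping above, using the Hugging Face `datasets` API that the module already imports (the toy column and duplicate indices are made up):

```python
from datasets import Dataset

# Toy test split with two questions.
test = Dataset.from_dict({'question': ['q1', 'q2']})

# 'double' mode: repeat every column so each question appears once per
# response order, mirroring `dataset.test[k] * 2` in the diff.
doubled = Dataset.from_dict({k: test[k] * 2 for k in test.column_names})
assert doubled['question'] == ['q1', 'q2', 'q1', 'q2']

# Duplicate removal: drop the rows whose predictions were identical across
# models, mirroring the `select(remaining_indices)` call.
dup_indices = [1, 3]  # hypothetical rows where both models answered the same
remaining = [i for i in range(len(doubled)) if i not in dup_indices]
filtered = doubled.select(remaining)
assert filtered['question'] == ['q1', 'q1']
```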