[Feature] Add double-order subjective evaluation and removal of duplicated responses between two models (#692)

* add features

* add doc string

* add doc string
bittersweet1999 2023-12-12 20:58:17 +08:00 committed by GitHub
parent 82a533a690
commit 6130394165
5 changed files with 76 additions and 11 deletions


@@ -36,7 +36,7 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
            type=LMEvaluator,
-            random_order=True,
+            infer_order='random',
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[


@@ -34,7 +34,6 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
            type=LMEvaluator,
-            random_order=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[


@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
+In addition, you can change the order in which the two models' responses are presented; please refer to `config/subjective_compare.py`.
+When `infer_order` is set to `random`, the two responses are shown to the judge in random order;
+when `infer_order` is set to `double`, each pair of responses is judged twice, once in each order.
 ### Single Model Scoring Configuration
 For `config/subjective_score.py`, it is mainly same with `config/subjective_compare.py`, and you just need to modify the eval mode to `singlescore`.
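For quick reference, here is a minimal sketch of how the new option sits in a dataset config. It is illustrative only: the import paths follow the usual OpenCompass layout, the judge prompt is elided, and it is not a drop-in replacement for `config/subjective_compare.py`.

```python
# Illustrative sketch, not the actual shipped config.
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate

subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        # 'random': shuffle which model appears as answer1/answer2 per question
        # 'double': judge every response pair twice, once in each order
        infer_order='double',
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                # judge prompt omitted here
            ]),
        ),
    ),
)
```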


@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
+In addition, the dataset config also lets you choose the order of the two responses in pairwise comparison; please refer to `config/subjective_compare.py`.
+When `infer_order` is set to `random`, the order of the two models' responses is randomly shuffled;
+when `infer_order` is set to `double`, the two models' responses are judged in both orders.
 ### Single-Answer Scoring Configuration
 For more detailed config settings for single-answer scoring, please refer to `config/subjective_score.py`. Most of that config is the same as the pairwise-comparison config; you only need to change the evaluation mode to `singlescore`.


@@ -1,8 +1,10 @@
+# flake8: noqa: E501
 import os.path as osp
 import random
 from typing import Dict, List, Optional

 import mmengine
+from datasets import Dataset
 from mmengine.config import ConfigDict

 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -14,20 +16,40 @@ from opencompass.utils.text_postprocessors import first_number_postprocess
 from opencompass.utils.types import get_type_from_cfg


-def randomize_preds_and_record_references(predictions,
-                                          references,
-                                          random_order,
-                                          seed=2680):
+def order_preds_and_record_references(predictions,
+                                      references,
+                                      infer_order,
+                                      seed=2680):
+    """Order the predictions and record the corresponding references.
+
+    Args:
+        predictions (List): List of multi-model predictions.
+        references (List): List of references, one per problem.
+        infer_order (str): The inference order mode, 'random' or 'double'.
+        seed (int, optional): Random seed.
+    """
     random.seed(seed)
     list_of_preds = [[] for _ in range(len(predictions))]
     for i in range(len(predictions[0]['model_preds'])):
         preds = [[pred['model_preds'][i], pred['model_name']]
                  for pred in predictions]
-        if random_order:
+        if infer_order == 'random':
             random.shuffle(preds)
         for j in range(len(preds)):
             list_of_preds[j].append(preds[j][0])
             references[i][f'answer{j+1}'] = preds[j][1]
+    if infer_order == 'double':
+        assert len(predictions) == 2
+        list_of_preds = [
+            a + b for a, b in zip(list_of_preds, reversed(list_of_preds))
+        ]
+        reversed_references = []
+        for item in references:
+            reversed_item = item.copy()
+            reversed_item['answer1'], reversed_item['answer2'] = reversed_item[
+                'answer2'], reversed_item['answer1']
+            reversed_references.append(reversed_item)
+        references += reversed_references
     return list_of_preds, references
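To make the `double` branch above concrete, here is a small trace with toy values. The import path assumes the function lives in `opencompass/openicl/icl_evaluator/lm_evaluator.py`, which is where this diff appears to apply; adjust it if the module layout differs.

```python
# Toy trace of infer_order='double'; model names and predictions are made up.
from opencompass.openicl.icl_evaluator.lm_evaluator import \
    order_preds_and_record_references

predictions = [
    {'model_name': 'model_a', 'model_preds': ['A1', 'A2']},
    {'model_name': 'model_b', 'model_preds': ['B1', 'B2']},
]
references = [{}, {}]

preds, refs = order_preds_and_record_references(predictions, references,
                                                infer_order='double')
# preds[0] == ['A1', 'A2', 'B1', 'B2']   -> fills the answer1 slot
# preds[1] == ['B1', 'B2', 'A1', 'A2']   -> fills the answer2 slot
# refs now has 4 entries: the original answer1/answer2 model mapping for each
# question, followed by the swapped mapping, so every pair is judged both ways.
```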
@ -52,10 +74,11 @@ class LMEvaluator:
prompt_template: ConfigDict, prompt_template: ConfigDict,
judge_cfg: ConfigDict, judge_cfg: ConfigDict,
output_path: str, output_path: str,
random_order: Optional[bool] = False, infer_order: Optional[str] = 'random',
dataset_cfg: Optional[ConfigDict] = None, dataset_cfg: Optional[ConfigDict] = None,
postprocessor: ConfigDict = dict(type=first_number_postprocess) postprocessor: ConfigDict = dict(type=first_number_postprocess)
) -> None: ) -> None:
assert infer_order in ['random', 'double']
self.output_path = output_path self.output_path = output_path
out_dir, out_name = osp.split(output_path) out_dir, out_name = osp.split(output_path)
if not out_dir: if not out_dir:
@@ -74,20 +97,36 @@ class LMEvaluator:
         self.postprocessor = get_type_from_cfg(postprocessor)
         self.logger = get_logger()
         self.dataset_cfg = dataset_cfg
-        self.random_order = random_order
+        self.infer_order = infer_order

     def score(self, predictions, references: Optional[List] = None) -> Dict:
         if type(predictions) == list:
             """Apply to multi-model comparison."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
-            predictions, references = randomize_preds_and_record_references(
-                predictions, references, self.random_order)
+            predictions, references = order_preds_and_record_references(
+                predictions, references, self.infer_order)
         elif type(predictions) == dict:
             """Apply to single-model scoring."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
             predictions = [predictions['model_preds']]
+
+        # count how many questions received identical predictions from all models
+        total_predictions_num = len(predictions[0])
+        dup_indices = []
+        for i in range(len(predictions[0])):
+            check = [sub[i] for sub in predictions]
+            if len(set(check)) == 1:
+                dup_indices.append(i)
+
+        if len(dup_indices) != 0:
+            # remove the duplicated predictions and their references
+            for index in sorted(dup_indices, reverse=True):
+                for sublist in predictions:
+                    del sublist[index]
+                del references[index]
+
         pred_dict = {}
         for i in range(len(predictions)):
             key = 'prediction' if i == 0 else f'prediction{i + 1}'
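The duplicate-filtering block added in this hunk can also be exercised in isolation; the snippet below uses made-up values and plain lists, independent of the evaluator class.

```python
# Standalone illustration of the duplicate-removal logic above (toy values).
predictions = [['same', 'x', 'same'], ['same', 'y', 'z']]
references = [{'question': 'q1'}, {'question': 'q2'}, {'question': 'q3'}]

dup_indices = [
    i for i in range(len(predictions[0]))
    if len({sub[i] for sub in predictions}) == 1
]
# dup_indices == [0]: both models produced 'same' for the first question.

for index in sorted(dup_indices, reverse=True):
    for sublist in predictions:
        del sublist[index]
    del references[index]

# predictions == [['x', 'same'], ['y', 'z']]
# references  == [{'question': 'q2'}, {'question': 'q3'}]
```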
@@ -95,6 +134,25 @@ class LMEvaluator:
         if self.dataset_cfg:
             dataset = build_dataset_from_cfg(self.dataset_cfg)
+
+            if self.infer_order == 'double':
+                new_ds = {
+                    k: dataset.test[k] * 2
+                    for k in dataset.test.column_names
+                }
+                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
+
+            if len(dup_indices) != 0:
+                remaining_indices = [
+                    idx for idx in range(len(dataset.test))
+                    if idx not in dup_indices
+                ]
+                dataset.reader.dataset['test'] = dataset.test.select(
+                    remaining_indices)
+                print(
+                    f'Among {total_predictions_num} total predictions, {len(dup_indices)} were identical across models and have been removed.'
+                )

             for k, v in pred_dict.items():
                 dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
                 dataset.reader.input_columns.append(k)
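Finally, a small sketch of the dataset-side handling in the last hunk: the test split is doubled so it lines up with the doubled prediction lists, and any indices whose predictions were identical across models are dropped. Column names and indices below are invented for illustration.

```python
# Toy sketch of the test-split handling for infer_order='double'.
from datasets import Dataset

test = Dataset.from_dict({'question': ['q1', 'q2'], 'capability': ['qa', 'qa']})

# Double every column so row i and row i + len(test) describe the same question.
doubled = Dataset.from_dict({k: test[k] * 2 for k in test.column_names})
# doubled['question'] == ['q1', 'q2', 'q1', 'q2']

# Drop rows whose predictions were identical across both models.
dup_indices = {1}  # pretend index 1 was flagged by the duplicate check
remaining = [i for i in range(len(doubled)) if i not in dup_indices]
filtered = doubled.select(remaining)
# filtered['question'] == ['q1', 'q1', 'q2']
```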