[Feature] Add double order of subjective evaluation and removing duplicated response among two models (#692)
* add features
* add doc string
* add doc string
parent 82a533a690
commit 6130394165
@@ -36,7 +36,7 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            random_order=True,
+            infer_order='random',
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
@@ -34,7 +34,6 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            random_order=True,
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
 
+In addition, you can also change the response order of the two models; please refer to `config/subjective_compare.py`.
+When `infer_order` is set to `random`, the two responses are shuffled into a random order;
+when `infer_order` is set to `double`, the responses of the two models are judged in both orders.
+
 ### Single Model Scoring Configuration
 
 For `config/subjective_score.py`, it is mostly the same as `config/subjective_compare.py`; you just need to change the eval mode to `singlescore`.
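For concreteness, here is a minimal sketch of a comparison-style evaluator config with the new option turned on. It is modeled on the config hunk above; the import lines and the prompt content are illustrative assumptions in the usual OpenCompass config style, not copied from `config/subjective_compare.py`.

```python
# Hedged sketch: only the infer_order line reflects this commit; the rest is
# a generic placeholder eval config.
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate

subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        # 'random' shuffles the two responses once per question;
        # 'double' judges every pair twice, once in each order.
        infer_order='double',
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt='{question}'),  # placeholder prompt
            ]),
        ),
    ),
)
```

The usual motivation for the `double` mode is to average out position bias in the judge model, since each answer then appears in both the first and the second slot.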
@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
 
+In addition, in the dataset config you can also choose the order in which the two responses are presented for comparison; please refer to `config/subjective_compare.py`.
+When `infer_order` is set to `random`, the order of the two models' responses is randomly shuffled;
+when `infer_order` is set to `double`, the two models' responses are judged in both orders.
+
 ### Single Response Scoring Configuration
 
 For single-response scoring, see `config/subjective_score.py` for the detailed config; most of it is the same as the pairwise comparison config, and you only need to set the eval mode to `singlescore`.
@@ -1,8 +1,10 @@
+# flake8: noqa: E501
 import os.path as osp
 import random
 from typing import Dict, List, Optional
 
 import mmengine
+from datasets import Dataset
 from mmengine.config import ConfigDict
 
 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -14,20 +16,40 @@ from opencompass.utils.text_postprocessors import first_number_postprocess
 from opencompass.utils.types import get_type_from_cfg
 
 
-def randomize_preds_and_record_references(predictions,
-                                          references,
-                                          random_order,
-                                          seed=2680):
+def order_preds_and_record_references(predictions,
+                                      references,
+                                      infer_order,
+                                      seed=2680):
+    """Order predictions based on args and record corresponding references.
+
+    Args:
+        predictions (List): List of multi-model predictions.
+        references (List): List of references for each problem.
+        infer_order (str, optional): The mode of inference order.
+        seed (int, optional): Random seed.
+    """
     random.seed(seed)
     list_of_preds = [[] for _ in range(len(predictions))]
     for i in range(len(predictions[0]['model_preds'])):
         preds = [[pred['model_preds'][i], pred['model_name']]
                  for pred in predictions]
-        if random_order:
+        if infer_order == 'random':
             random.shuffle(preds)
         for j in range(len(preds)):
             list_of_preds[j].append(preds[j][0])
             references[i][f'answer{j+1}'] = preds[j][1]
+    if infer_order == 'double':
+        assert len(predictions) == 2
+        list_of_preds = [
+            a + b for a, b in zip(list_of_preds, reversed(list_of_preds))
+        ]
+        reversed_references = []
+        for item in references:
+            reversed_item = item.copy()
+            reversed_item['answer1'], reversed_item['answer2'] = reversed_item[
+                'answer2'], reversed_item['answer1']
+            reversed_references.append(reversed_item)
+        references += reversed_references
     return list_of_preds, references
 
 
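To see what the new `double` branch actually produces, here is a small self-contained sketch that reproduces its core logic on invented inputs for two models (nothing here is imported from OpenCompass; the answer strings and model names are toy values):

```python
# Toy reproduction of the 'double' ordering: every question ends up judged
# twice, once with each model's answer in the first position.
preds_a = ['A1', 'A2']   # model A's answers to two questions
preds_b = ['B1', 'B2']   # model B's answers to the same questions

list_of_preds = [preds_a, preds_b]
references = [{'answer1': 'model_a', 'answer2': 'model_b'} for _ in preds_a]

# Same shape of logic as the added branch: concatenate each prediction list
# with the other model's list, and append references with answers swapped.
list_of_preds = [
    a + b for a, b in zip(list_of_preds, reversed(list_of_preds))
]
reversed_references = []
for item in references:
    swapped = item.copy()
    swapped['answer1'], swapped['answer2'] = swapped['answer2'], swapped['answer1']
    reversed_references.append(swapped)
references = references + reversed_references

print(list_of_preds)
# [['A1', 'A2', 'B1', 'B2'], ['B1', 'B2', 'A1', 'A2']]
print(references[0]['answer1'], references[2]['answer1'])
# model_a model_b
```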
@@ -52,10 +74,11 @@ class LMEvaluator:
         prompt_template: ConfigDict,
         judge_cfg: ConfigDict,
         output_path: str,
-        random_order: Optional[bool] = False,
+        infer_order: Optional[str] = 'random',
         dataset_cfg: Optional[ConfigDict] = None,
         postprocessor: ConfigDict = dict(type=first_number_postprocess)
     ) -> None:
+        assert infer_order in ['random', 'double']
         self.output_path = output_path
         out_dir, out_name = osp.split(output_path)
         if not out_dir:
@@ -74,20 +97,36 @@ class LMEvaluator:
         self.postprocessor = get_type_from_cfg(postprocessor)
         self.logger = get_logger()
         self.dataset_cfg = dataset_cfg
-        self.random_order = random_order
+        self.infer_order = infer_order
 
     def score(self, predictions, references: Optional[List] = None) -> Dict:
         if type(predictions) == list:
             """Apply to multi-model comparison."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
-            predictions, references = randomize_preds_and_record_references(
-                predictions, references, self.random_order)
+            predictions, references = order_preds_and_record_references(
+                predictions, references, self.infer_order)
         elif type(predictions) == dict:
             """Apply to single-model scoring."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
             predictions = [predictions['model_preds']]
+
+        # count duplicated predictions
+        total_predictions_num = len(predictions[0])
+        dup_indices = []
+        for i in range(len(predictions[0])):
+            check = [sub[i] for sub in predictions]
+            if len(set(check)) == 1:
+                dup_indices.append(i)
+
+        if len(dup_indices) != 0:
+            # remove duplicated predictions
+            for index in sorted(dup_indices, reverse=True):
+                for sublist in predictions:
+                    del sublist[index]
+                del references[index]
+
         pred_dict = {}
         for i in range(len(predictions)):
             key = 'prediction' if i == 0 else f'prediction{i + 1}'
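The duplicate check above drops every question on which all models returned exactly the same string, so the judge never has to compare a response with an identical copy of itself. A standalone sketch of that filtering, with made-up data:

```python
# Toy version of the duplicate-removal step.
predictions = [
    ['same', 'model A says X', 'same'],   # model A's answers
    ['same', 'model B says Y', 'same'],   # model B's answers
]
references = [{'id': 0}, {'id': 1}, {'id': 2}]

total_predictions_num = len(predictions[0])
dup_indices = [
    i for i in range(total_predictions_num)
    if len({model[i] for model in predictions}) == 1
]

# Delete from the back so earlier indices stay valid.
for index in sorted(dup_indices, reverse=True):
    for sublist in predictions:
        del sublist[index]
    del references[index]

print(predictions)   # [['model A says X'], ['model B says Y']]
print(references)    # [{'id': 1}]
print(f'{len(dup_indices)} of {total_predictions_num} predictions were identical and removed')
```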
@@ -95,6 +134,25 @@ class LMEvaluator:
 
         if self.dataset_cfg:
             dataset = build_dataset_from_cfg(self.dataset_cfg)
+
+            if self.infer_order == 'double':
+                new_ds = {
+                    k: dataset.test[k] * 2
+                    for k in dataset.test.column_names
+                }
+                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
+
+            if len(dup_indices) != 0:
+                remaining_indices = [
+                    idx for idx in range(len(dataset.test))
+                    if idx not in dup_indices
+                ]
+                dataset.reader.dataset['test'] = dataset.test.select(
+                    remaining_indices)
+                print(
+                    f'Among total {total_predictions_num} predictions, there are {len(dup_indices)} predictions totally same, which are removed!'
+                )
+
             for k, v in pred_dict.items():
                 dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
                 dataset.reader.input_columns.append(k)
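Finally, the dataset-side changes in the last hunk boil down to two plain HuggingFace `datasets` operations: rebuilding the test split with every column repeated when `infer_order == 'double'`, and `select`-ing the rows that survived duplicate removal. A rough sketch with an invented two-column dataset:

```python
from datasets import Dataset

# Stand-in for dataset.test with three questions (invented columns).
test = Dataset.from_dict({
    'question': ['q0', 'q1', 'q2'],
    'capability': ['math', 'code', 'math'],
})

# infer_order == 'double': repeat every column so each question appears twice,
# lining up with the doubled prediction lists.
doubled = Dataset.from_dict({k: test[k] * 2 for k in test.column_names})
print(len(doubled))          # 6

# Duplicate removal: keep only rows whose index was not flagged as identical.
dup_indices = [1]            # pretend q1 got identical answers from both models
remaining = [i for i in range(len(test)) if i not in dup_indices]
filtered = test.select(remaining)
print(filtered['question'])  # ['q0', 'q2']
```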