diff --git a/configs/datasets/subjective_cmp/subjective_corev2.py b/configs/datasets/subjective_cmp/subjective_corev2.py
index 464855bf..d6112c7d 100644
--- a/configs/datasets/subjective_cmp/subjective_corev2.py
+++ b/configs/datasets/subjective_cmp/subjective_corev2.py
@@ -36,7 +36,7 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            random_order=True,
+            infer_order='random',
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
diff --git a/configs/datasets/subjective_cmp/subjective_creation.py b/configs/datasets/subjective_cmp/subjective_creation.py
index 164d2677..2d4ec228 100644
--- a/configs/datasets/subjective_cmp/subjective_creation.py
+++ b/configs/datasets/subjective_cmp/subjective_creation.py
@@ -34,7 +34,6 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            random_order=True,
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
diff --git a/docs/en/advanced_guides/subjective_evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md
index 221138ff..3df24986 100644
--- a/docs/en/advanced_guides/subjective_evaluation.md
+++ b/docs/en/advanced_guides/subjective_evaluation.md
@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
 
+In addition, you can change the response order of the two models; please refer to `config/subjective_compare.py`.
+When `infer_order` is set to `random`, the two responses are presented to the judge in a random order.
+When `infer_order` is set to `double`, each pair of responses is judged twice, once in each order.
+
 ### Single Model Scoring Configuration
 
 For `config/subjective_score.py`, it is mainly same with `config/subjective_compare.py`, and you just need to modify the eval mode to `singlescore`.
diff --git a/docs/zh_cn/advanced_guides/subjective_evaluation.md b/docs/zh_cn/advanced_guides/subjective_evaluation.md
index 4c7c298f..4c63329a 100644
--- a/docs/zh_cn/advanced_guides/subjective_evaluation.md
+++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md
@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
 
+此外,在数据集的配置config中,还可以选择两回答比较时的回答顺序,请参考`config/subjective_compare.py`,
+当`infer_order`设置为`random`时,将对两模型的回复顺序进行随机打乱,
+当`infer_order`设置为`double`时,将把两模型的回复按两种先后顺序进行判断。
+
 ### 单回答打分配置
 
 对于单回答打分,更详细的config setting请参考 `config/subjective_score.py`,该config的大部分都与两回答比较的config相同,只需要修改评测模式即可,将评测模式设置为`singlescore`。
diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py
index 8edd5e73..0c0a6848 100644
--- a/opencompass/openicl/icl_evaluator/lm_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py
@@ -1,8 +1,10 @@
+# flake8: noqa: E501
 import os.path as osp
 import random
 from typing import Dict, List, Optional
 
 import mmengine
+from datasets import Dataset
 from mmengine.config import ConfigDict
 
 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -14,20 +16,40 @@ from opencompass.utils.text_postprocessors import first_number_postprocess
 from opencompass.utils.types import get_type_from_cfg
 
 
-def randomize_preds_and_record_references(predictions,
-                                          references,
-                                          random_order,
-                                          seed=2680):
+def order_preds_and_record_references(predictions,
+                                      references,
+                                      infer_order,
+                                      seed=2680):
+    """Order predictions based on args and record references accordingly.
+
+    Args:
+        predictions (List): List of multi model predictions.
+        references (List): List of references, one per problem.
+        infer_order (str): The mode of inference order, 'random' or 'double'.
+        seed (int, optional): Random seed.
+    """
     random.seed(seed)
     list_of_preds = [[] for _ in range(len(predictions))]
     for i in range(len(predictions[0]['model_preds'])):
         preds = [[pred['model_preds'][i], pred['model_name']]
                  for pred in predictions]
-        if random_order:
+        if infer_order == 'random':
             random.shuffle(preds)
         for j in range(len(preds)):
             list_of_preds[j].append(preds[j][0])
             references[i][f'answer{j+1}'] = preds[j][1]
+    if infer_order == 'double':
+        assert len(predictions) == 2
+        list_of_preds = [
+            a + b for a, b in zip(list_of_preds, reversed(list_of_preds))
+        ]
+        reversed_references = []
+        for item in references:
+            reversed_item = item.copy()
+            reversed_item['answer1'], reversed_item['answer2'] = reversed_item[
+                'answer2'], reversed_item['answer1']
+            reversed_references.append(reversed_item)
+        references += reversed_references
     return list_of_preds, references
 
 
@@ -52,10 +74,11 @@ class LMEvaluator:
         prompt_template: ConfigDict,
         judge_cfg: ConfigDict,
         output_path: str,
-        random_order: Optional[bool] = False,
+        infer_order: Optional[str] = 'random',
         dataset_cfg: Optional[ConfigDict] = None,
         postprocessor: ConfigDict = dict(type=first_number_postprocess)
     ) -> None:
+        assert infer_order in ['random', 'double']
         self.output_path = output_path
         out_dir, out_name = osp.split(output_path)
         if not out_dir:
@@ -74,20 +97,36 @@ class LMEvaluator:
         self.postprocessor = get_type_from_cfg(postprocessor)
         self.logger = get_logger()
         self.dataset_cfg = dataset_cfg
-        self.random_order = random_order
+        self.infer_order = infer_order
 
     def score(self, predictions, references: Optional[List] = None) -> Dict:
         if type(predictions) == list:
             """Apply to multi-model comparison."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
-            predictions, references = randomize_preds_and_record_references(
-                predictions, references, self.random_order)
+            predictions, references = order_preds_and_record_references(
+                predictions, references, self.infer_order)
         elif type(predictions) == dict:
             """Apply to single-model scoring."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
             predictions = [predictions['model_preds']]
+
+        # count the problems where all models give identical predictions
+        total_predictions_num = len(predictions[0])
+        dup_indices = []
+        for i in range(len(predictions[0])):
+            check = [sub[i] for sub in predictions]
+            if len(set(check)) == 1:
+                dup_indices.append(i)
+
+        if len(dup_indices) != 0:
+            # remove the duplicated predictions and their references
+            for index in sorted(dup_indices, reverse=True):
+                for sublist in predictions:
+                    del sublist[index]
+                del references[index]
+
         pred_dict = {}
         for i in range(len(predictions)):
             key = 'prediction' if i == 0 else f'prediction{i + 1}'
@@ -95,6 +134,25 @@
 
         if self.dataset_cfg:
             dataset = build_dataset_from_cfg(self.dataset_cfg)
+
+            if self.infer_order == 'double':
+                new_ds = {
+                    k: dataset.test[k] * 2
+                    for k in dataset.test.column_names
+                }
+                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
+
+            if len(dup_indices) != 0:
+                remaining_indices = [
+                    idx for idx in range(len(dataset.test))
+                    if idx not in dup_indices
+                ]
+                dataset.reader.dataset['test'] = dataset.test.select(
+                    remaining_indices)
+                print(
+                    f'Among {total_predictions_num} total predictions, {len(dup_indices)} are identical across models and have been removed!'
+                )
+
             for k, v in pred_dict.items():
                 dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
                 dataset.reader.input_columns.append(k)