From e975a96fa1995ab98ff2d6a4543876434af42e63 Mon Sep 17 00:00:00 2001
From: Guo Qipeng
Date: Fri, 19 Jan 2024 11:29:27 +0800
Subject: [PATCH] Update cdme config and evaluator (#812)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* update cdme config and evaluator

* fix cdme prompt

* move CDME trim post-processor as a separate evaluator

---------

Co-authored-by: 郭琦鹏
---
 configs/datasets/cdme/cdme200k.py | 34 +++++++++++++++++++++++++++++--
 opencompass/datasets/cdme/cdme.py | 31 ++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/configs/datasets/cdme/cdme200k.py b/configs/datasets/cdme/cdme200k.py
index 2c22935f..12166365 100644
--- a/configs/datasets/cdme/cdme200k.py
+++ b/configs/datasets/cdme/cdme200k.py
@@ -46,13 +46,21 @@ cdme_eval_cfg = dict(
     dataset_postprocessor=dict(type=cdme_dataset_postprocess),
     pred_role='BOT')
 
-context_lengths = list(range(1000, 201000, 1000))
+cdme_trim_eval_cfg = dict(
+    evaluator=dict(type=CDMEEvaluator, use_trim=True),
+    pred_postprocessor=dict(type=cdme_postprocess),
+    dataset_postprocessor=dict(type=cdme_dataset_postprocess),
+    pred_role='BOT')
+
+#context_lengths = list(range(1000, 201000, 1000))
+context_lengths = [16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = "linear"
 
 base_path = './data/CDME'
 file_list = ['zh_finance.jsonl']
 cdme_datasets = []
+cdme_trim_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
@@ -73,9 +81,31 @@ for original_context_length in context_lengths:
             'language': 'Chinese',
             'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n',
             'retrieval_question': '小明最喜欢的实习地点是哪里?请按照'
-            '“小明最喜欢的实习地点就是________。”的格式回答。',
+            '“小明最喜欢的实习地点就是________。”的格式回答。\n',
             'reader_cfg': cdme_reader_cfg,
             'infer_cfg': cdme_infer_cfg,
             'eval_cfg': cdme_eval_cfg
         }
         cdme_datasets.append(dataset_dict)
+
+        trim_dataset_dict = {
+            'abbr': f'CDME_Length{original_context_length}'
+            f'Depth{int(depth_percent)}',
+            'type': CDMEDataset,
+            'path': base_path,
+            'length': original_context_length,
+            'depth': int(depth_percent),
+            'tokenizer_model': 'gpt-4',
+            'file_list': file_list,
+            'num_repeats_per_file': 10,
+            'length_buffer': 200,
+            'guide': True,
+            'language': 'Chinese',
+            'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n',
+            'retrieval_question': '小明最喜欢的实习地点是哪里?请按照'
+            '“小明最喜欢的实习地点就是________。”的格式回答。\n',
+            'reader_cfg': cdme_reader_cfg,
+            'infer_cfg': cdme_infer_cfg,
+            'eval_cfg': cdme_trim_eval_cfg
+        }
+        cdme_trim_datasets.append(trim_dataset_dict)
diff --git a/opencompass/datasets/cdme/cdme.py b/opencompass/datasets/cdme/cdme.py
index 0bb3b1b5..d3e0f3f2 100644
--- a/opencompass/datasets/cdme/cdme.py
+++ b/opencompass/datasets/cdme/cdme.py
@@ -129,6 +129,32 @@ class CDMEDataset(BaseDataset):
 
 class CDMEEvaluator(BaseEvaluator):
 
+    def __init__(self, use_trim=False):
+        self.use_trim = use_trim
+
+    @staticmethod
+    def _trim_prediction(prediction, reference):
+        """Trims the prediction string based on the length of the reference
+        string.
+
+        Args:
+            prediction (str): The prediction string.
+            reference (str): The reference string.
+
+        Returns:
+            str: The trimmed prediction string.
+        """
+        l08 = int(0.8 * len(reference))
+        l12 = int(1.2 * len(reference))
+        trimmed_prediction = prediction[:l12]
+
+        if len(trimmed_prediction) > l08 and \
+                reference[-1] in trimmed_prediction[l08:]:
+            end_pos = l08 + trimmed_prediction[l08:].index(reference[-1]) + 1
+            trimmed_prediction = trimmed_prediction[:end_pos]
+
+        return trimmed_prediction
+
     def levenshtein_distance(self, s1, s2):
         if len(s1) < len(s2):
             return self.levenshtein_distance(s2, s1)
@@ -159,6 +185,11 @@ class CDMEEvaluator(BaseEvaluator):
         for prediction, reference in zip(predictions, references):
             prediction = re.sub(r'\s+', '', prediction)
             reference = re.sub(r'\s+', '', reference)
+
+            if self.use_trim:
+                prediction = CDMEEvaluator._trim_prediction(
+                    prediction, reference)
+
             edit_distance = self.levenshtein_distance(prediction, reference)
             max_len = max(len(prediction), len(reference))
             score = 100 * (1 -
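
For quick reference, below is a minimal standalone sketch of the trimming behaviour this patch adds. The helper mirrors the _trim_prediction method above; the trim_prediction name, the __main__ driver, and the example strings are illustrative only and are not part of the patch or of the CDME data files.

# Minimal standalone sketch (not part of the patch): mirrors the
# _trim_prediction helper added above so its effect on scoring is easy to see.
# The example strings below are made up for illustration.
import re


def trim_prediction(prediction, reference):
    """Cap the prediction at 1.2x the reference length, then cut it at the
    first occurrence of the reference's final character past 0.8x that
    length, so trailing chatter does not hurt the edit-distance score."""
    l08 = int(0.8 * len(reference))
    l12 = int(1.2 * len(reference))
    trimmed = prediction[:l12]
    if len(trimmed) > l08 and reference[-1] in trimmed[l08:]:
        end_pos = l08 + trimmed[l08:].index(reference[-1]) + 1
        trimmed = trimmed[:end_pos]
    return trimmed


if __name__ == '__main__':
    reference = '小明最喜欢的实习地点就是上海人工智能实验室。'
    # A verbose answer that states the target sentence and then keeps talking.
    prediction = '小明最喜欢的实习地点就是上海人工智能实验室。因为那里有很多有趣的项目。'
    # The evaluator strips whitespace before trimming and scoring.
    prediction = re.sub(r'\s+', '', prediction)
    reference = re.sub(r'\s+', '', reference)
    print(trim_prediction(prediction, reference))
    # Prints the reference sentence itself: the trailing explanation is cut,
    # so the Levenshtein-based score is no longer penalised by answer length.

Note that with use_trim=False (the __init__ default) the evaluator scores exactly as before, which is why the patch registers the trimmed variant as a separate cdme_trim_eval_cfg / cdme_trim_datasets pair instead of changing the existing datasets.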