diff --git a/configs/datasets/cdme/cdme200k.py b/configs/datasets/cdme/cdme200k.py deleted file mode 100644 index 12166365..00000000 --- a/configs/datasets/cdme/cdme200k.py +++ /dev/null @@ -1,111 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets.cdme.cdme import CDMEDataset -from opencompass.datasets.cdme.cdme import CDMEEvaluator -from opencompass.datasets.cdme.cdme import cdme_postprocess -from opencompass.datasets.cdme.cdme import cdme_dataset_postprocess -import math - - -def logistic(x, L=100, x0=50, k=0.1): - return round(L / (1 + math.exp(-k * (x - x0))), 3) - - -def generate_linear_space(start, end, num): - if num == 1: - return [start] - elif num < 1: - raise ValueError("num must be at least 1.") - step = (end - start) / (num - 1) - return [start + step * i for i in range(num)] - - -def generate_depth_percents(intervals, interval_type): - if interval_type == 'linear': - return generate_linear_space(0, 100, intervals) - elif interval_type == 'sigmoid': - linear_space = generate_linear_space(0, 100, intervals) - return [logistic(x) for x in linear_space] - else: - raise ValueError('Unsupported interval type') - - -cdme_reader_cfg = dict(input_columns=['prompt'], output_column='answer') - -cdme_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template='''{prompt}'''), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) - -cdme_eval_cfg = dict( - evaluator=dict(type=CDMEEvaluator), - pred_postprocessor=dict(type=cdme_postprocess), - dataset_postprocessor=dict(type=cdme_dataset_postprocess), - pred_role='BOT') - -cdme_trim_eval_cfg = dict( - evaluator=dict(type=CDMEEvaluator, use_trim=True), - pred_postprocessor=dict(type=cdme_postprocess), - dataset_postprocessor=dict(type=cdme_dataset_postprocess), - pred_role='BOT') - -#context_lengths = list(range(1000, 201000, 1000)) -context_lengths = [16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000] -document_depth_percent_intervals = 20 -document_depth_percent_interval_type = "linear" - -base_path = './data/CDME' -file_list = ['zh_finance.jsonl'] -cdme_datasets = [] -cdme_trim_datasets = [] - -for original_context_length in context_lengths: - for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): - dataset_dict = { - 'abbr': f'CDME_Length{original_context_length}' - f'Depth{int(depth_percent)}', - 'type': CDMEDataset, - 'path': base_path, - 'length': original_context_length, - 'depth': int(depth_percent), - 'tokenizer_model': 'gpt-4', - 'file_list': file_list, - 'num_repeats_per_file': 10, - 'length_buffer': 200, - 'guide': True, - 'language': 'Chinese', - 'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n', - 'retrieval_question': '小明最喜欢的实习地点是哪里?请按照' - '“小明最喜欢的实习地点就是________。”的格式回答。\n', - 'reader_cfg': cdme_reader_cfg, - 'infer_cfg': cdme_infer_cfg, - 'eval_cfg': cdme_eval_cfg - } - cdme_datasets.append(dataset_dict) - - trim_dataset_dict = { - 'abbr': f'CDME_Length{original_context_length}' - f'Depth{int(depth_percent)}', - 'type': CDMEDataset, - 'path': base_path, - 'length': original_context_length, - 'depth': int(depth_percent), - 'tokenizer_model': 'gpt-4', - 'file_list': file_list, - 'num_repeats_per_file': 10, - 'length_buffer': 200, - 'guide': True, - 'language': 'Chinese', - 'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n', - 'retrieval_question': '小明最喜欢的实习地点是哪里?请按照' - '“小明最喜欢的实习地点就是________。”的格式回答。\n', - 'reader_cfg': cdme_reader_cfg, - 'infer_cfg': cdme_infer_cfg, - 'eval_cfg': cdme_trim_eval_cfg - } - cdme_trim_datasets.append(trim_dataset_dict) diff --git a/configs/datasets/cdme/cdme32k.py b/configs/datasets/cdme/cdme32k.py deleted file mode 100644 index fcc2e049..00000000 --- a/configs/datasets/cdme/cdme32k.py +++ /dev/null @@ -1,81 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets.cdme.cdme import CDMEDataset -from opencompass.datasets.cdme.cdme import CDMEEvaluator -from opencompass.datasets.cdme.cdme import cdme_postprocess -from opencompass.datasets.cdme.cdme import cdme_dataset_postprocess -import math - - -def logistic(x, L=100, x0=50, k=0.1): - return round(L / (1 + math.exp(-k * (x - x0))), 3) - - -def generate_linear_space(start, end, num): - if num == 1: - return [start] - elif num < 1: - raise ValueError("num must be at least 1.") - step = (end - start) / (num - 1) - return [start + step * i for i in range(num)] - - -def generate_depth_percents(intervals, interval_type): - if interval_type == 'linear': - return generate_linear_space(0, 100, intervals) - elif interval_type == 'sigmoid': - linear_space = generate_linear_space(0, 100, intervals) - return [logistic(x) for x in linear_space] - else: - raise ValueError('Unsupported interval type') - - -cdme_reader_cfg = dict(input_columns=['prompt'], output_column='answer') - -cdme_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template='''{prompt}'''), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) - -cdme_eval_cfg = dict( - evaluator=dict(type=CDMEEvaluator), - pred_postprocessor=dict(type=cdme_postprocess), - dataset_postprocessor=dict(type=cdme_dataset_postprocess), - pred_role='BOT') - -context_lengths = list(range(1000, 33000, 1000)) -document_depth_percent_intervals = 20 -document_depth_percent_interval_type = "linear" - -base_path = './data/CDME' -file_list = ['zh_finance.jsonl'] -cdme_datasets = [] - -for original_context_length in context_lengths: - for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): - dataset_dict = { - 'abbr': f'CDME_Length{original_context_length}' - f'Depth{int(depth_percent)}', - 'type': CDMEDataset, - 'path': base_path, - 'length': original_context_length, - 'depth': int(depth_percent), - 'tokenizer_model': 'gpt-4', - 'file_list': file_list, - 'num_repeats_per_file': 10, - 'length_buffer': 200, - 'guide': True, - 'language': 'Chinese', - 'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n', - 'retrieval_question': '小明最喜欢的实习地点是哪里?请按照' - '“小明最喜欢的实习地点就是________。”的格式回答。', - 'reader_cfg': cdme_reader_cfg, - 'infer_cfg': cdme_infer_cfg, - 'eval_cfg': cdme_eval_cfg - } - cdme_datasets.append(dataset_dict) diff --git a/configs/datasets/cdme/cdme8k.py b/configs/datasets/cdme/cdme8k.py deleted file mode 100644 index d4af51b5..00000000 --- a/configs/datasets/cdme/cdme8k.py +++ /dev/null @@ -1,81 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets.cdme.cdme import CDMEDataset -from opencompass.datasets.cdme.cdme import CDMEEvaluator -from opencompass.datasets.cdme.cdme import cdme_postprocess -from opencompass.datasets.cdme.cdme import cdme_dataset_postprocess -import math - - -def logistic(x, L=100, x0=50, k=0.1): - return round(L / (1 + math.exp(-k * (x - x0))), 3) - - -def generate_linear_space(start, end, num): - if num == 1: - return [start] - elif num < 1: - raise ValueError("num must be at least 1.") - step = (end - start) / (num - 1) - return [start + step * i for i in range(num)] - - -def generate_depth_percents(intervals, interval_type): - if interval_type == 'linear': - return generate_linear_space(0, 100, intervals) - elif interval_type == 'sigmoid': - linear_space = generate_linear_space(0, 100, intervals) - return [logistic(x) for x in linear_space] - else: - raise ValueError('Unsupported interval type') - - -cdme_reader_cfg = dict(input_columns=['prompt'], output_column='answer') - -cdme_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template='''{prompt}'''), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) - -cdme_eval_cfg = dict( - evaluator=dict(type=CDMEEvaluator), - pred_postprocessor=dict(type=cdme_postprocess), - dataset_postprocessor=dict(type=cdme_dataset_postprocess), - pred_role='BOT') - -context_lengths = list(range(1000, 9000, 1000)) -document_depth_percent_intervals = 20 -document_depth_percent_interval_type = "linear" - -base_path = './data/CDME' -file_list = ['zh_finance.jsonl'] -cdme_datasets = [] - -for original_context_length in context_lengths: - for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): - dataset_dict = { - 'abbr': f'CDME_Length{original_context_length}' - f'Depth{int(depth_percent)}', - 'type': CDMEDataset, - 'path': base_path, - 'length': original_context_length, - 'depth': int(depth_percent), - 'tokenizer_model': 'gpt-4', - 'file_list': file_list, - 'num_repeats_per_file': 10, - 'length_buffer': 200, - 'guide': True, - 'language': 'Chinese', - 'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n', - 'retrieval_question': '小明最喜欢的实习地点是哪里?请按照' - '“小明最喜欢的实习地点就是________。”的格式回答。', - 'reader_cfg': cdme_reader_cfg, - 'infer_cfg': cdme_infer_cfg, - 'eval_cfg': cdme_eval_cfg - } - cdme_datasets.append(dataset_dict) diff --git a/configs/datasets/cdme/multi_needle/cdme8k_cot2_italy.py b/configs/datasets/cdme/multi_needle/cdme8k_cot2_italy.py deleted file mode 100644 index 3f5fab2b..00000000 --- a/configs/datasets/cdme/multi_needle/cdme8k_cot2_italy.py +++ /dev/null @@ -1,92 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets.cdme.cdme_multi import CDMEDataset -from opencompass.datasets.cdme.cdme_multi import CDMEEvaluator -from opencompass.datasets.cdme.cdme_multi import cdme_postprocess -from opencompass.datasets.cdme.cdme_multi import cdme_dataset_postprocess -import math - - -def logistic(x, L=100, x0=50, k=0.1): - return round(L / (1 + math.exp(-k * (x - x0))), 3) - - -def generate_linear_space(start, end, num): - if num == 1: - return [start] - elif num < 1: - raise ValueError("num must be at least 1.") - step = (end - start) / (num - 1) - return [start + step * i for i in range(num)] - - -def generate_depth_percents(intervals, interval_type): - if interval_type == 'linear': - return generate_linear_space(0, 100, intervals) - elif interval_type == 'sigmoid': - linear_space = generate_linear_space(0, 100, intervals) - return [logistic(x) for x in linear_space] - else: - raise ValueError('Unsupported interval type') - - -cdme_reader_cfg = dict(input_columns=['prompt'], output_column='answer') - -cdme_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template='''{prompt}'''), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) - -cdme_eval_cfg = dict( - evaluator=dict(type=CDMEEvaluator), - pred_postprocessor=dict(type=cdme_postprocess), - dataset_postprocessor=dict(type=cdme_dataset_postprocess), - pred_role='BOT') - -context_lengths = list(range(1000, 9000, 1000)) -document_depth_percent_intervals = 20 -document_depth_percent_interval_type = "linear" - -base_path = './data/CDME' -file_list = ['zh_finance.jsonl'] -cdme_datasets = [] - -needles = ['\n意大利的佛罗伦萨有一家名为"La Giostra"的餐馆,是整个佛罗伦萨中排行第一的餐馆。\n', - '"La Giostra"餐馆的特色菜肴是松露奶酪通心粉。'] -retrieval_question = ("佛罗伦萨中排行第一的餐馆的特色菜肴是?" - "请按照'佛罗伦萨中排行第一的餐馆的特色菜肴是______。'的格式回答。") -answer = "佛罗伦萨中排行第一的餐馆的特色菜肴是松露奶酪通心粉。" - -keyword = "松露奶酪通心粉" -diff = 25 - -for original_context_length in context_lengths: - for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): - dataset_dict = { - 'abbr': f'CDME_Length{original_context_length}' - f'Depth{int(depth_percent)}', - 'type': CDMEDataset, - 'path': base_path, - 'length': original_context_length, - 'depth': int(depth_percent), - 'tokenizer_model': 'gpt-4', - 'file_list': file_list, - 'num_repeats_per_file': 10, - 'length_buffer': 200, - 'guide': True, - 'language': 'Chinese', - 'needles': needles, - 'diff': diff, - 'retrieval_question': retrieval_question, - 'answer': answer, - 'keyword': keyword, - 'reader_cfg': cdme_reader_cfg, - 'infer_cfg': cdme_infer_cfg, - 'eval_cfg': cdme_eval_cfg - } - cdme_datasets.append(dataset_dict) diff --git a/configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py b/configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py deleted file mode 100644 index acc82e85..00000000 --- a/configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py +++ /dev/null @@ -1,93 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets.cdme.cdme_multi import CDMEDataset -from opencompass.datasets.cdme.cdme_multi import CDMEEvaluator -from opencompass.datasets.cdme.cdme_multi import cdme_postprocess -from opencompass.datasets.cdme.cdme_multi import cdme_dataset_postprocess -import math - - -def logistic(x, L=100, x0=50, k=0.1): - return round(L / (1 + math.exp(-k * (x - x0))), 3) - - -def generate_linear_space(start, end, num): - if num == 1: - return [start] - elif num < 1: - raise ValueError("num must be at least 1.") - step = (end - start) / (num - 1) - return [start + step * i for i in range(num)] - - -def generate_depth_percents(intervals, interval_type): - if interval_type == 'linear': - return generate_linear_space(0, 100, intervals) - elif interval_type == 'sigmoid': - linear_space = generate_linear_space(0, 100, intervals) - return [logistic(x) for x in linear_space] - else: - raise ValueError('Unsupported interval type') - - -cdme_reader_cfg = dict(input_columns=['prompt'], output_column='answer') - -cdme_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template='''{prompt}'''), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) - -cdme_eval_cfg = dict( - evaluator=dict(type=CDMEEvaluator), - pred_postprocessor=dict(type=cdme_postprocess), - dataset_postprocessor=dict(type=cdme_dataset_postprocess), - pred_role='BOT') - -context_lengths = list(range(1000, 9000, 1000)) -document_depth_percent_intervals = 20 -document_depth_percent_interval_type = "linear" - -base_path = './data/CDME' -file_list = ['zh_finance.jsonl'] -cdme_datasets = [] - -needles = ['\n意大利的佛罗伦萨有一家名为"La Giostra"的餐馆,是整个佛罗伦萨中排行第一的餐馆。\n', - '"La Giostra"餐馆的特色菜肴是松露奶酪通心粉。', - '松露奶酪通心粉是该家餐馆的有着意大利皇室烹饪血统的大厨Jack制作',] -retrieval_question = ("制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫什么?" - "请按照'制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫______。'的格式回答。") -answer = "制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫Jack" - -keyword = "Jack" -diff = 25 - -for original_context_length in context_lengths: - for depth_percent in generate_depth_percents( - document_depth_percent_intervals, - document_depth_percent_interval_type): - dataset_dict = { - 'abbr': f'CDME_Length{original_context_length}' - f'Depth{int(depth_percent)}', - 'type': CDMEDataset, - 'path': base_path, - 'length': original_context_length, - 'depth': int(depth_percent), - 'tokenizer_model': 'gpt-4', - 'file_list': file_list, - 'num_repeats_per_file': 10, - 'length_buffer': 200, - 'guide': True, - 'language': 'Chinese', - 'needles': needles, - 'diff': diff, - 'retrieval_question': retrieval_question, - 'answer': answer, - 'keyword': keyword, - 'reader_cfg': cdme_reader_cfg, - 'infer_cfg': cdme_infer_cfg, - 'eval_cfg': cdme_eval_cfg - } - cdme_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/atc/atc.py b/configs/datasets/needlebench/atc/atc.py new file mode 100644 index 00000000..b2eeebea --- /dev/null +++ b/configs/datasets/needlebench/atc/atc.py @@ -0,0 +1,104 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.atc import NeedleBenchATCOrderedDataset +from opencompass.datasets.needlebench.atc import NeedleBenchATCDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +needle_num_list = list(range(2, 100, 3)) +document_depth_percent_intervals = 20 +repeats = 30 +names_path = './data/needlebench/names.json' + +needlebench_atc_datasets_zh = [] +needlebench_atc_datasets_en = [] +needlebench_atc_datasets_zh_ordered = [] +needlebench_atc_datasets_en_ordered = [] + +for num_needles in needle_num_list: + # ordered English version + dataset_dict = { + 'abbr': f'needlebench_atc_challenge' + f'needle_{num_needles}_en_ordered', + 'type': NeedleBenchATCOrderedDataset, + 'path': names_path, + 'num_needles': num_needles, + 'language': 'English', + 'repeats': repeats, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_atc_datasets_en_ordered.append(dataset_dict) + + +for num_needles in needle_num_list: + # ordered Chinese version + dataset_dict = { + 'abbr': f'needlebench_atc_challenge' + f'needle_{num_needles}_zh_ordered', + 'type': NeedleBenchATCOrderedDataset, + 'path': names_path, + 'num_needles': num_needles, + 'language': 'Chinese', + 'repeats': repeats, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_atc_datasets_zh_ordered.append(dataset_dict) + +for num_needles in needle_num_list: + # standard English version + dataset_dict = { + 'abbr': f'needlebench_atc_challenge' + f'needle_{num_needles}_en', + 'type': NeedleBenchATCDataset, + 'path': names_path, + 'num_needles': num_needles, + 'language': 'English', + 'repeats': repeats, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_atc_datasets_en.append(dataset_dict) + +for num_needles in needle_num_list: + # standard Chinese version + dataset_dict = { + 'abbr': f'needlebench_atc_challenge' + f'needle_{num_needles}_zh', + 'type': NeedleBenchATCDataset, + 'path': names_path, + 'num_needles': num_needles, + 'language': 'Chinese', + 'repeats': repeats, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_atc_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench.py b/configs/datasets/needlebench/needlebench.py new file mode 100644 index 00000000..291a3dee --- /dev/null +++ b/configs/datasets/needlebench/needlebench.py @@ -0,0 +1,10 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_4k.needlebench import needlebench_datasets as needlebench_datasets_4k + from .needlebench_8k.needlebench import needlebench_datasets as needlebench_datasets_8k + from .needlebench_32k.needlebench import needlebench_datasets as needlebench_datasets_32k + from .needlebench_128k.needlebench import needlebench_datasets as needlebench_datasets_128k + from .needlebench_200k.needlebench import needlebench_datasets as needlebench_datasets_200k + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_128k/needlebench.py b/configs/datasets/needlebench/needlebench_128k/needlebench.py new file mode 100644 index 00000000..b73abb1f --- /dev/null +++ b/configs/datasets/needlebench/needlebench_128k/needlebench.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets + + from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets + from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning.py b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning.py new file mode 100644 index 00000000..e15fd552 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning.py @@ -0,0 +1,303 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_en = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_en.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_en.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_en.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_en.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_zh = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_zh.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_zh.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_zh.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval.py b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval.py new file mode 100644 index 00000000..5d561f4d --- /dev/null +++ b/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval.py @@ -0,0 +1,111 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' +depths_float = generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type) +depths = [int(depth) for depth in depths_float] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_128k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 3000, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_128k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_128k/needlebench_single.py b/configs/datasets/needlebench/needlebench_128k/needlebench_single.py new file mode 100644 index 00000000..62a5f38b --- /dev/null +++ b/configs/datasets/needlebench/needlebench_128k/needlebench_single.py @@ -0,0 +1,114 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_128k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_128k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_200k/needlebench.py b/configs/datasets/needlebench/needlebench_200k/needlebench.py new file mode 100644 index 00000000..b73abb1f --- /dev/null +++ b/configs/datasets/needlebench/needlebench_200k/needlebench.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets + + from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets + from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning.py b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning.py new file mode 100644 index 00000000..1478f5b4 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning.py @@ -0,0 +1,303 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_en = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_en.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_en.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_en.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_en.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_zh = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_zh.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_zh.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_zh.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval.py b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval.py new file mode 100644 index 00000000..d01145e8 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval.py @@ -0,0 +1,111 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' +depths_float = generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type) +depths = [int(depth) for depth in depths_float] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_200k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 3000, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_200k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_200k/needlebench_single.py b/configs/datasets/needlebench/needlebench_200k/needlebench_single.py new file mode 100644 index 00000000..9e7fa129 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_200k/needlebench_single.py @@ -0,0 +1,114 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_200k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_200k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_32k/needlebench.py b/configs/datasets/needlebench/needlebench_32k/needlebench.py new file mode 100644 index 00000000..b73abb1f --- /dev/null +++ b/configs/datasets/needlebench/needlebench_32k/needlebench.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets + + from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets + from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning.py b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning.py new file mode 100644 index 00000000..efb43206 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning.py @@ -0,0 +1,303 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_en = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 3000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_en.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 3000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_en.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 3000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_en.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 3000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_en.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_zh = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_zh.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_zh.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_zh.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval.py b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval.py new file mode 100644 index 00000000..b04cb952 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval.py @@ -0,0 +1,111 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' +depths_float = generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type) +depths = [int(depth) for depth in depths_float] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_32k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 3000, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_32k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_32k/needlebench_single.py b/configs/datasets/needlebench/needlebench_32k/needlebench_single.py new file mode 100644 index 00000000..6b9eb3d4 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_32k/needlebench_single.py @@ -0,0 +1,114 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_32k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 3000, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_32k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_4k/needlebench.py b/configs/datasets/needlebench/needlebench_4k/needlebench.py new file mode 100644 index 00000000..b73abb1f --- /dev/null +++ b/configs/datasets/needlebench/needlebench_4k/needlebench.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets + + from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets + from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning.py b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning.py new file mode 100644 index 00000000..01771eb3 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning.py @@ -0,0 +1,303 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(1000, 5000, 1000)) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_en = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_en.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_en.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_en.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_en.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_zh = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_zh.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_zh.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_zh.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval.py b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval.py new file mode 100644 index 00000000..e18736d3 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval.py @@ -0,0 +1,111 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(1000, 5000, 1000)) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' +depths_float = generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type) +depths = [int(depth) for depth in depths_float] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_4k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 1000, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_4k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_4k/needlebench_single.py b/configs/datasets/needlebench/needlebench_4k/needlebench_single.py new file mode 100644 index 00000000..895b6556 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_4k/needlebench_single.py @@ -0,0 +1,114 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(1000, 5000, 1000)) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_4k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_4k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench.py b/configs/datasets/needlebench/needlebench_8k/needlebench.py new file mode 100644 index 00000000..b73abb1f --- /dev/null +++ b/configs/datasets/needlebench/needlebench_8k/needlebench.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets + + from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets + from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning.py b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning.py new file mode 100644 index 00000000..98aa0556 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning.py @@ -0,0 +1,303 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(5000, 9000, 1000)) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_en = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 1000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_en.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 1000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_en.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 1000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_en.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 1000, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_en.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_zh = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_zh.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_zh.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_zh.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval.py b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval.py new file mode 100644 index 00000000..2890c6cc --- /dev/null +++ b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval.py @@ -0,0 +1,111 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(5000, 9000, 1000)) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' +depths_float = generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type) +depths = [int(depth) for depth in depths_float] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_8k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 1300, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_8k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch.py b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch.py new file mode 100644 index 00000000..e088a969 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch.py @@ -0,0 +1,120 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(5000, 9000, 1000)) +document_depth_percent_intervals_list = [1, 5, 10, 15, 20] +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for document_depth_percent_intervals in document_depth_percent_intervals_list: + depths_float = generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type) + depths = [int(depth) for depth in depths_float] + + for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_8k_batch{document_depth_percent_intervals}', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 50, + 'length_buffer': 1300, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for document_depth_percent_intervals in document_depth_percent_intervals_list: + depths_float = generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type) + depths = [int(depth) for depth in depths_float] + + for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_8k_batch{document_depth_percent_intervals}', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 50, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_8k/needlebench_single.py b/configs/datasets/needlebench/needlebench_8k/needlebench_single.py new file mode 100644 index 00000000..3ee55bb8 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_8k/needlebench_single.py @@ -0,0 +1,114 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(5000, 9000, 1000)) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_8k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 800, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_8k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/eval_needlebench.py b/configs/eval_needlebench.py new file mode 100644 index 00000000..dcf444bd --- /dev/null +++ b/configs/eval_needlebench.py @@ -0,0 +1,78 @@ +from opencompass.models import HuggingFaceCausalLM +from opencompass.models.turbomind import TurboMindModel +from opencompass.runners import SlurmSequentialRunner +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +from mmengine.config import read_base +with read_base(): + # eval needlebench_4k + from .datasets.needlebench.needlebench_4k.needlebench import needlebench_datasets + from .summarizers.needlebench import needlebench_4k_summarizer as summarizer + + # only eval original "needle in a haystack test" in needlebench_4k + # from .datasets.needlebench.needlebench_4k.needlebench_single import needlebench_datasets_zh, needlebench_datasets_en + # from .summarizers.needlebench import needlebench_4k_summarizer as summarizer + + # eval Ancestral Tracing Challenge(ATC) + # from .datasets.needlebench.atc.atc import needlebench_atc_datasets_zh, needlebench_atc_datasets_en + # from .summarizers.needlebench import needlebench_atc_summarizer as summarizer + +datasets = sum([v for k, v in locals().items() if ('datasets' in k)], []) + +hf_internlm2_chat_7b_model_meta_template = dict( + round=[ + dict(role='HUMAN', + begin='<|im_start|>user\n', end='<|im_end|>\n'), + dict(role='BOT', begin='<|im_start|>assistant\n', + end='<|im_end|>\n', generate=True), + ], +) +hf_internlm2_chat_7b = dict( + type=HuggingFaceCausalLM, + abbr='internlm2-chat-7b-hf', + path="internlm/internlm2-chat-7b", + tokenizer_path='internlm/internlm2-chat-7b', + model_kwargs=dict( + trust_remote_code=True, + device_map='auto', + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + use_fast=False, + trust_remote_code=True, + ), + max_out_len=2000, + max_seq_len=32768, + batch_size=8, + meta_template=hf_internlm2_chat_7b_model_meta_template, + run_cfg=dict(num_gpus=1, num_procs=1), + end_str='<|im_end|>', + ) + +internlm2_chat_7b_200k = dict( + type=TurboMindModel, + abbr='internlm2-chat-7b-200k', + path="internlm/internlm2-chat-7b", + meta_template=hf_internlm2_chat_7b_model_meta_template, + engine_config=dict(session_len=210000, + max_batch_size=8, + rope_scaling_factor=2.0, + model_name="internlm2-chat-7b"), + gen_config=dict(top_k=1, top_p=0.8, + temperature=1.0, + max_new_tokens=2000), + max_out_len=2000, + max_seq_len=210000, + batch_size=8, + concurrency=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) + +models = [ + # hf_internlm2_chat_7b, + internlm2_chat_7b_200k, +] + +work_dir = './outputs/needlebench' diff --git a/configs/eval_needleinahaystack.py b/configs/eval_needleinahaystack.py deleted file mode 100644 index 533cc4db..00000000 --- a/configs/eval_needleinahaystack.py +++ /dev/null @@ -1,40 +0,0 @@ -from opencompass.models import HuggingFaceCausalLM - -from mmengine.config import read_base -with read_base(): - from .datasets.cdme.cdme8k import cdme_datasets - -datasets = [*cdme_datasets] - - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], -) - -models = [ - dict( - type=HuggingFaceCausalLM, - abbr='internlm-chat-20b-hf', - path="internlm/internlm-chat-20b", - tokenizer_path='internlm/internlm-chat-20b', - model_kwargs=dict( - trust_remote_code=True, - device_map='auto', - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=100, - max_seq_len=8192, - batch_size=8, - meta_template=_meta_template, - run_cfg=dict(num_gpus=2, num_procs=1), - end_str='', - ) -] diff --git a/configs/eval_needleinahaystack_turbomind.py b/configs/eval_needleinahaystack_turbomind.py deleted file mode 100644 index 5f9a2f11..00000000 --- a/configs/eval_needleinahaystack_turbomind.py +++ /dev/null @@ -1,28 +0,0 @@ -from opencompass.models.turbomind import TurboMindModel - -from mmengine.config import read_base -with read_base(): - from .datasets.cdme.cdme200k import cdme_datasets - -datasets = [*cdme_datasets] - -internlm_meta_template = dict(round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), -], - eos_token_id=103028) - -models = [ - # config for internlm-chat-20b - dict( - type=TurboMindModel, - abbr='internlm-chat-20b-turbomind', - path='./turbomind', - max_out_len=100, - max_seq_len=201000, - batch_size=8, - concurrency=8, - meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/summarizers/needlebench.py b/configs/summarizers/needlebench.py new file mode 100644 index 00000000..57f7b557 --- /dev/null +++ b/configs/summarizers/needlebench.py @@ -0,0 +1,657 @@ +from opencompass.summarizers.needlebench import NeedleBenchSummarizer +from opencompass.summarizers.needlebench import NeedleBenchATCSummarizer + +# ----------NeedleBench-4k-summarizer---------- +context_lengths_4k = list(range(1000, 5000, 1000)) +depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100] + +# Initialize the lists +_needlebench_4k_2needle_en = [] +_needlebench_4k_3needle_en = [] +_needlebench_4k_4needle_en = [] +_needlebench_4k_5needle_en = [] +_needlebench_4k_2needle_zh = [] +_needlebench_4k_3needle_zh = [] +_needlebench_4k_4needle_zh = [] +_needlebench_4k_5needle_zh = [] +_needlebench_4k_origin_en = [] +_needlebench_4k_origin_zh = [] + +# Fill the lists using nested loops +for original_context_length in context_lengths_4k: + for depth_percent in depths: + _needlebench_4k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_4k') + _needlebench_4k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_4k') + _needlebench_4k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_4k') + _needlebench_4k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_4k') + _needlebench_4k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_4k') + _needlebench_4k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_4k') + _needlebench_4k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_4k') + _needlebench_4k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_4k') + + _needlebench_4k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_4k') + _needlebench_4k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_4k') + +# Concatenate the multi-needle and origin lists +_needlebench_4k_multi_needle_en = _needlebench_4k_2needle_en + _needlebench_4k_3needle_en + _needlebench_4k_4needle_en + _needlebench_4k_5needle_en +_needlebench_4k_multi_needle_zh = _needlebench_4k_2needle_zh + _needlebench_4k_3needle_zh + _needlebench_4k_4needle_zh + _needlebench_4k_5needle_zh +_needlebench_4k_origin = _needlebench_4k_origin_en + _needlebench_4k_origin_zh +_needlebench_4k_multi_needle = _needlebench_4k_multi_needle_en + _needlebench_4k_multi_needle_zh + +# Repeating the same process for parallel (assuming it's similar to origin_en) +_needlebench_4k_parallel_en = [] +_needlebench_4k_parallel_zh = [] +for original_context_length in context_lengths_4k: + _needlebench_4k_parallel_en.append(f'Length{original_context_length}_parallel_en_4k') +for original_context_length in context_lengths_4k: + _needlebench_4k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_4k') +_needlebench_4k_parallel = _needlebench_4k_parallel_en + _needlebench_4k_parallel_zh + +needlebench_summary_groups = [ + {'name': 'original_version', 'subsets': _needlebench_4k_origin}, + {'name': 'original_version_zh', 'subsets': _needlebench_4k_origin_zh}, + {'name': 'original_version_en', 'subsets': _needlebench_4k_origin_en}, + + {'name': 'multi_needle_en', 'subsets': _needlebench_4k_multi_needle_en}, + {'name': 'multi_needle2_en', 'subsets': _needlebench_4k_2needle_en}, + {'name': 'multi_needle3_en', 'subsets': _needlebench_4k_3needle_en}, + {'name': 'multi_needle4_en', 'subsets': _needlebench_4k_4needle_en}, + {'name': 'multi_needle5_en', 'subsets': _needlebench_4k_5needle_en}, + + {'name': 'multi_needle_zh', 'subsets': _needlebench_4k_multi_needle_zh}, + {'name': 'multi_needle2_zh', 'subsets': _needlebench_4k_2needle_zh}, + {'name': 'multi_needle3_zh', 'subsets': _needlebench_4k_3needle_zh}, + {'name': 'multi_needle4_zh', 'subsets': _needlebench_4k_4needle_zh}, + {'name': 'multi_needle5_zh', 'subsets': _needlebench_4k_5needle_zh}, + + {'name': 'multi_needle', 'subsets': _needlebench_4k_multi_needle}, + + {'name': 'parallel_version', 'subsets': _needlebench_4k_parallel}, + {'name': 'parallel_version_zh', 'subsets': _needlebench_4k_parallel_zh}, + {'name': 'parallel_version_en', 'subsets': _needlebench_4k_parallel_en}, + + + {'name': 'overall', + 'subsets': [['original_version', 'naive_average'], + ['multi_needle', 'naive_average'], + ['parallel_version', 'average_score']], + 'weights': {'original_version': 0.4, + 'multi_needle': 0.3, + 'parallel_version': 0.3}}, +] +needlebench_4k_summarizer = dict( + type=NeedleBenchSummarizer, + dataset_abbrs=[ + 'overall', + '--------- NeedleBench-4k Single-Needle ---------', # category + 'original_version', + 'original_version_zh', + 'original_version_en', + '--------- NeedleBench-4k Parallel-Needles ---------', # category + 'parallel_version', + 'parallel_version_zh', + 'parallel_version_en', + '--------- NeedleBench-4k Multi-Needles ---------', # category + 'multi_needle', + 'multi_needle_en', + 'multi_needle_zh', + 'multi_needle2_en', + 'multi_needle3_en', + 'multi_needle4_en', + 'multi_needle5_en', + 'multi_needle2_zh', + 'multi_needle3_zh', + 'multi_needle4_zh', + 'multi_needle5_zh', + + # *_needlebench_4k_origin, *_needlebench_4k_multi_needle, *_needlebench_4k_parallel, + ], + summary_groups=needlebench_summary_groups, +) + +# ----------NeedleBench-8k-summarizer---------- + +context_lengths_8k = list(range(5000, 9000, 1000)) + +# Initialize the lists +_needlebench_8k_2needle_en = [] +_needlebench_8k_3needle_en = [] +_needlebench_8k_4needle_en = [] +_needlebench_8k_5needle_en = [] +_needlebench_8k_2needle_zh = [] +_needlebench_8k_3needle_zh = [] +_needlebench_8k_4needle_zh = [] +_needlebench_8k_5needle_zh = [] +_needlebench_8k_origin_en = [] +_needlebench_8k_origin_zh = [] + +# Fill the lists using nested loops +for original_context_length in context_lengths_8k: + for depth_percent in depths: + _needlebench_8k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_8k') + _needlebench_8k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_8k') + _needlebench_8k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_8k') + _needlebench_8k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_8k') + _needlebench_8k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_8k') + _needlebench_8k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_8k') + _needlebench_8k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_8k') + _needlebench_8k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_8k') + + _needlebench_8k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_8k') + _needlebench_8k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_8k') + +# Concatenate the multi-needle and origin lists +_needlebench_8k_multi_needle_en = _needlebench_8k_2needle_en + _needlebench_8k_3needle_en + _needlebench_8k_4needle_en + _needlebench_8k_5needle_en +_needlebench_8k_multi_needle_zh = _needlebench_8k_2needle_zh + _needlebench_8k_3needle_zh + _needlebench_8k_4needle_zh + _needlebench_8k_5needle_zh +_needlebench_8k_origin = _needlebench_8k_origin_en + _needlebench_8k_origin_zh +_needlebench_8k_multi_needle = _needlebench_8k_multi_needle_en + _needlebench_8k_multi_needle_zh + +# Repeating the same process for parallel (assuming it's similar to origin_en) +_needlebench_8k_parallel_en = [] +_needlebench_8k_parallel_zh = [] +for original_context_length in context_lengths_8k: + _needlebench_8k_parallel_en.append(f'Length{original_context_length}_parallel_en_8k') +for original_context_length in context_lengths_8k: + _needlebench_8k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_8k') +_needlebench_8k_parallel = _needlebench_8k_parallel_en + _needlebench_8k_parallel_zh + +needlebench_summary_groups = [ + {'name': 'original_version', 'subsets': _needlebench_8k_origin}, + {'name': 'original_version_zh', 'subsets': _needlebench_8k_origin_zh}, + {'name': 'original_version_en', 'subsets': _needlebench_8k_origin_en}, + + {'name': 'multi_needle_en', 'subsets': _needlebench_8k_multi_needle_en}, + {'name': 'multi_needle2_en', 'subsets': _needlebench_8k_2needle_en}, + {'name': 'multi_needle3_en', 'subsets': _needlebench_8k_3needle_en}, + {'name': 'multi_needle4_en', 'subsets': _needlebench_8k_4needle_en}, + {'name': 'multi_needle5_en', 'subsets': _needlebench_8k_5needle_en}, + + {'name': 'multi_needle_zh', 'subsets': _needlebench_8k_multi_needle_zh}, + {'name': 'multi_needle2_zh', 'subsets': _needlebench_8k_2needle_zh}, + {'name': 'multi_needle3_zh', 'subsets': _needlebench_8k_3needle_zh}, + {'name': 'multi_needle4_zh', 'subsets': _needlebench_8k_4needle_zh}, + {'name': 'multi_needle5_zh', 'subsets': _needlebench_8k_5needle_zh}, + + {'name': 'multi_needle', 'subsets': _needlebench_8k_multi_needle}, + + {'name': 'parallel_version', 'subsets': _needlebench_8k_parallel}, + {'name': 'parallel_version_zh', 'subsets': _needlebench_8k_parallel_zh}, + {'name': 'parallel_version_en', 'subsets': _needlebench_8k_parallel_en}, + + + {'name': 'overall', + 'subsets': [['original_version', 'naive_average'], + ['multi_needle', 'naive_average'], + ['parallel_version', 'average_score']], + 'weights': {'original_version': 0.4, + 'multi_needle': 0.3, + 'parallel_version': 0.3}}, +] +needlebench_8k_summarizer = dict( + type=NeedleBenchSummarizer, + dataset_abbrs=[ + 'overall', + '--------- NeedleBench-8k Single-Needle ---------', # category + 'original_version', + 'original_version_zh', + 'original_version_en', + '--------- NeedleBench-8k Parallel-Needles ---------', # category + 'parallel_version', + 'parallel_version_zh', + 'parallel_version_en', + '--------- NeedleBench-8k Multi-Needles ---------', # category + 'multi_needle', + 'multi_needle_en', + 'multi_needle_zh', + 'multi_needle2_en', + 'multi_needle3_en', + 'multi_needle4_en', + 'multi_needle5_en', + 'multi_needle2_zh', + 'multi_needle3_zh', + 'multi_needle4_zh', + 'multi_needle5_zh', + + # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel, + ], + summary_groups=needlebench_summary_groups, +) + +# ----------NeedleBench-32k-summarizer---------- + +context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000] + +# Initialize the lists +_needlebench_32k_2needle_en = [] +_needlebench_32k_3needle_en = [] +_needlebench_32k_4needle_en = [] +_needlebench_32k_5needle_en = [] +_needlebench_32k_2needle_zh = [] +_needlebench_32k_3needle_zh = [] +_needlebench_32k_4needle_zh = [] +_needlebench_32k_5needle_zh = [] +_needlebench_32k_origin_en = [] +_needlebench_32k_origin_zh = [] + +# Fill the lists using nested loops +for original_context_length in context_lengths_32k: + for depth_percent in depths: + _needlebench_32k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_32k') + _needlebench_32k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_32k') + _needlebench_32k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_32k') + _needlebench_32k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_32k') + _needlebench_32k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_32k') + _needlebench_32k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_32k') + _needlebench_32k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_32k') + _needlebench_32k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_32k') + + _needlebench_32k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_32k') + _needlebench_32k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_32k') + +# Concatenate the multi-needle and origin lists +_needlebench_32k_multi_needle_en = _needlebench_32k_2needle_en + _needlebench_32k_3needle_en + _needlebench_32k_4needle_en + _needlebench_32k_5needle_en +_needlebench_32k_multi_needle_zh = _needlebench_32k_2needle_zh + _needlebench_32k_3needle_zh + _needlebench_32k_4needle_zh + _needlebench_32k_5needle_zh +_needlebench_32k_origin = _needlebench_32k_origin_en + _needlebench_32k_origin_zh +_needlebench_32k_multi_needle = _needlebench_32k_multi_needle_en + _needlebench_32k_multi_needle_zh + +# Repeating the same process for parallel (assuming it's similar to origin_en) +_needlebench_32k_parallel_en = [] +_needlebench_32k_parallel_zh = [] +for original_context_length in context_lengths_32k: + _needlebench_32k_parallel_en.append(f'Length{original_context_length}_parallel_en_32k') +for original_context_length in context_lengths_32k: + _needlebench_32k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_32k') +_needlebench_32k_parallel = _needlebench_32k_parallel_en + _needlebench_32k_parallel_zh + +needlebench_summary_groups = [ + {'name': 'original_version', 'subsets': _needlebench_32k_origin}, + {'name': 'original_version_zh', 'subsets': _needlebench_32k_origin_zh}, + {'name': 'original_version_en', 'subsets': _needlebench_32k_origin_en}, + + {'name': 'multi_needle_en', 'subsets': _needlebench_32k_multi_needle_en}, + {'name': 'multi_needle2_en', 'subsets': _needlebench_32k_2needle_en}, + {'name': 'multi_needle3_en', 'subsets': _needlebench_32k_3needle_en}, + {'name': 'multi_needle4_en', 'subsets': _needlebench_32k_4needle_en}, + {'name': 'multi_needle5_en', 'subsets': _needlebench_32k_5needle_en}, + + {'name': 'multi_needle_zh', 'subsets': _needlebench_32k_multi_needle_zh}, + {'name': 'multi_needle2_zh', 'subsets': _needlebench_32k_2needle_zh}, + {'name': 'multi_needle3_zh', 'subsets': _needlebench_32k_3needle_zh}, + {'name': 'multi_needle4_zh', 'subsets': _needlebench_32k_4needle_zh}, + {'name': 'multi_needle5_zh', 'subsets': _needlebench_32k_5needle_zh}, + + {'name': 'multi_needle', 'subsets': _needlebench_32k_multi_needle}, + + {'name': 'parallel_version', 'subsets': _needlebench_32k_parallel}, + {'name': 'parallel_version_zh', 'subsets': _needlebench_32k_parallel_zh}, + {'name': 'parallel_version_en', 'subsets': _needlebench_32k_parallel_en}, + + + {'name': 'overall', + 'subsets': [['original_version', 'naive_average'], + ['multi_needle', 'naive_average'], + ['parallel_version', 'average_score']], + 'weights': {'original_version': 0.4, + 'multi_needle': 0.3, + 'parallel_version': 0.3}}, +] +needlebench_32k_summarizer = dict( + type=NeedleBenchSummarizer, + dataset_abbrs=[ + 'overall', + '--------- NeedleBench-32k Single-Needle ---------', # category + 'original_version', + 'original_version_zh', + 'original_version_en', + '--------- NeedleBench-32k Parallel-Needles ---------', # category + 'parallel_version', + 'parallel_version_zh', + 'parallel_version_en', + '--------- NeedleBench-32k Multi-Needles ---------', # category + 'multi_needle', + 'multi_needle_en', + 'multi_needle_zh', + 'multi_needle2_en', + 'multi_needle3_en', + 'multi_needle4_en', + 'multi_needle5_en', + 'multi_needle2_zh', + 'multi_needle3_zh', + 'multi_needle4_zh', + 'multi_needle5_zh', + + # *_needlebench_32k_origin, *_needlebench_32k_multi_needle, *_needlebench_32k_parallel, + ], + summary_groups=needlebench_summary_groups, +) + +# ----------NeedleBench-128k-summarizer---------- + +context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) + +# Initialize the lists +_needlebench_128k_2needle_en = [] +_needlebench_128k_3needle_en = [] +_needlebench_128k_4needle_en = [] +_needlebench_128k_5needle_en = [] +_needlebench_128k_2needle_zh = [] +_needlebench_128k_3needle_zh = [] +_needlebench_128k_4needle_zh = [] +_needlebench_128k_5needle_zh = [] +_needlebench_128k_origin_en = [] +_needlebench_128k_origin_zh = [] + +# Fill the lists using nested loops +for original_context_length in context_lengths_128k: + for depth_percent in depths: + _needlebench_128k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_128k') + _needlebench_128k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_128k') + _needlebench_128k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_128k') + _needlebench_128k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_128k') + _needlebench_128k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_128k') + _needlebench_128k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_128k') + _needlebench_128k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_128k') + _needlebench_128k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_128k') + + _needlebench_128k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_128k') + _needlebench_128k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_128k') + +# Concatenate the multi-needle and origin lists +_needlebench_128k_multi_needle_en = _needlebench_128k_2needle_en + _needlebench_128k_3needle_en + _needlebench_128k_4needle_en + _needlebench_128k_5needle_en +_needlebench_128k_multi_needle_zh = _needlebench_128k_2needle_zh + _needlebench_128k_3needle_zh + _needlebench_128k_4needle_zh + _needlebench_128k_5needle_zh +_needlebench_128k_origin = _needlebench_128k_origin_en + _needlebench_128k_origin_zh +_needlebench_128k_multi_needle = _needlebench_128k_multi_needle_en + _needlebench_128k_multi_needle_zh + +# Repeating the same process for parallel (assuming it's similar to origin_en) +_needlebench_128k_parallel_en = [] +_needlebench_128k_parallel_zh = [] +for original_context_length in context_lengths_128k: + _needlebench_128k_parallel_en.append(f'Length{original_context_length}_parallel_en_128k') +for original_context_length in context_lengths_128k: + _needlebench_128k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_128k') +_needlebench_128k_parallel = _needlebench_128k_parallel_en + _needlebench_128k_parallel_zh + +needlebench_summary_groups = [ + {'name': 'original_version', 'subsets': _needlebench_128k_origin}, + {'name': 'original_version_zh', 'subsets': _needlebench_128k_origin_zh}, + {'name': 'original_version_en', 'subsets': _needlebench_128k_origin_en}, + + {'name': 'multi_needle_en', 'subsets': _needlebench_128k_multi_needle_en}, + {'name': 'multi_needle2_en', 'subsets': _needlebench_128k_2needle_en}, + {'name': 'multi_needle3_en', 'subsets': _needlebench_128k_3needle_en}, + {'name': 'multi_needle4_en', 'subsets': _needlebench_128k_4needle_en}, + {'name': 'multi_needle5_en', 'subsets': _needlebench_128k_5needle_en}, + + {'name': 'multi_needle_zh', 'subsets': _needlebench_128k_multi_needle_zh}, + {'name': 'multi_needle2_zh', 'subsets': _needlebench_128k_2needle_zh}, + {'name': 'multi_needle3_zh', 'subsets': _needlebench_128k_3needle_zh}, + {'name': 'multi_needle4_zh', 'subsets': _needlebench_128k_4needle_zh}, + {'name': 'multi_needle5_zh', 'subsets': _needlebench_128k_5needle_zh}, + + {'name': 'multi_needle', 'subsets': _needlebench_128k_multi_needle}, + + {'name': 'parallel_version', 'subsets': _needlebench_128k_parallel}, + {'name': 'parallel_version_zh', 'subsets': _needlebench_128k_parallel_zh}, + {'name': 'parallel_version_en', 'subsets': _needlebench_128k_parallel_en}, + + + {'name': 'overall', + 'subsets': [['original_version', 'naive_average'], + ['multi_needle', 'naive_average'], + ['parallel_version', 'average_score']], + 'weights': {'original_version': 0.4, + 'multi_needle': 0.3, + 'parallel_version': 0.3}}, +] +needlebench_128k_summarizer = dict( + type=NeedleBenchSummarizer, + dataset_abbrs=[ + 'overall', + '--------- NeedleBench-128k Single-Needle ---------', # category + 'original_version', + 'original_version_zh', + 'original_version_en', + '--------- NeedleBench-128k Parallel-Needles ---------', # category + 'parallel_version', + 'parallel_version_zh', + 'parallel_version_en', + '--------- NeedleBench-128k Multi-Needles ---------', # category + 'multi_needle', + 'multi_needle_en', + 'multi_needle_zh', + 'multi_needle2_en', + 'multi_needle3_en', + 'multi_needle4_en', + 'multi_needle5_en', + 'multi_needle2_zh', + 'multi_needle3_zh', + 'multi_needle4_zh', + 'multi_needle5_zh', + + # *_needlebench_128k_origin, *_needlebench_128k_multi_needle, *_needlebench_128k_parallel, + ], + summary_groups=needlebench_summary_groups, +) + +# ----------NeedleBench-200k-summarizer---------- + +context_lengths_200k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) + +# Initialize the lists +_needlebench_200k_2needle_en = [] +_needlebench_200k_3needle_en = [] +_needlebench_200k_4needle_en = [] +_needlebench_200k_5needle_en = [] +_needlebench_200k_2needle_zh = [] +_needlebench_200k_3needle_zh = [] +_needlebench_200k_4needle_zh = [] +_needlebench_200k_5needle_zh = [] +_needlebench_200k_origin_en = [] +_needlebench_200k_origin_zh = [] + +# Fill the lists using nested loops +for original_context_length in context_lengths_200k: + for depth_percent in depths: + _needlebench_200k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_200k') + _needlebench_200k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_200k') + _needlebench_200k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_200k') + _needlebench_200k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_200k') + _needlebench_200k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_200k') + _needlebench_200k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_200k') + _needlebench_200k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_200k') + _needlebench_200k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_200k') + + _needlebench_200k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_200k') + _needlebench_200k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_200k') + +# Concatenate the multi-needle and origin lists +_needlebench_200k_multi_needle_en = _needlebench_200k_2needle_en + _needlebench_200k_3needle_en + _needlebench_200k_4needle_en + _needlebench_200k_5needle_en +_needlebench_200k_multi_needle_zh = _needlebench_200k_2needle_zh + _needlebench_200k_3needle_zh + _needlebench_200k_4needle_zh + _needlebench_200k_5needle_zh +_needlebench_200k_origin = _needlebench_200k_origin_en + _needlebench_200k_origin_zh +_needlebench_200k_multi_needle = _needlebench_200k_multi_needle_en + _needlebench_200k_multi_needle_zh + +# Repeating the same process for parallel (assuming it's similar to origin_en) +_needlebench_200k_parallel_en = [] +_needlebench_200k_parallel_zh = [] +for original_context_length in context_lengths_200k: + _needlebench_200k_parallel_en.append(f'Length{original_context_length}_parallel_en_200k') +for original_context_length in context_lengths_200k: + _needlebench_200k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_200k') +_needlebench_200k_parallel = _needlebench_200k_parallel_en + _needlebench_200k_parallel_zh + +needlebench_summary_groups = [ + {'name': 'original_version', 'subsets': _needlebench_200k_origin}, + {'name': 'original_version_zh', 'subsets': _needlebench_200k_origin_zh}, + {'name': 'original_version_en', 'subsets': _needlebench_200k_origin_en}, + + {'name': 'multi_needle_en', 'subsets': _needlebench_200k_multi_needle_en}, + {'name': 'multi_needle2_en', 'subsets': _needlebench_200k_2needle_en}, + {'name': 'multi_needle3_en', 'subsets': _needlebench_200k_3needle_en}, + {'name': 'multi_needle4_en', 'subsets': _needlebench_200k_4needle_en}, + {'name': 'multi_needle5_en', 'subsets': _needlebench_200k_5needle_en}, + + {'name': 'multi_needle_zh', 'subsets': _needlebench_200k_multi_needle_zh}, + {'name': 'multi_needle2_zh', 'subsets': _needlebench_200k_2needle_zh}, + {'name': 'multi_needle3_zh', 'subsets': _needlebench_200k_3needle_zh}, + {'name': 'multi_needle4_zh', 'subsets': _needlebench_200k_4needle_zh}, + {'name': 'multi_needle5_zh', 'subsets': _needlebench_200k_5needle_zh}, + + {'name': 'multi_needle', 'subsets': _needlebench_200k_multi_needle}, + + {'name': 'parallel_version', 'subsets': _needlebench_200k_parallel}, + {'name': 'parallel_version_zh', 'subsets': _needlebench_200k_parallel_zh}, + {'name': 'parallel_version_en', 'subsets': _needlebench_200k_parallel_en}, + + {'name': 'overall', + 'subsets': [['original_version', 'naive_average'], + ['multi_needle', 'naive_average'], + ['parallel_version', 'average_score']], + 'weights': {'original_version': 0.4, + 'multi_needle': 0.3, + 'parallel_version': 0.3}}, +] +needlebench_200k_summarizer = dict( + type=NeedleBenchSummarizer, + dataset_abbrs=[ + 'overall', + '--------- NeedleBench-200k Single-Needle ---------', # category + 'original_version', + 'original_version_zh', + 'original_version_en', + '--------- NeedleBench-200k Parallel-Needles ---------', # category + 'parallel_version', + 'parallel_version_zh', + 'parallel_version_en', + '--------- NeedleBench-200k Multi-Needles ---------', # category + 'multi_needle', + 'multi_needle_en', + 'multi_needle_zh', + 'multi_needle2_en', + 'multi_needle3_en', + 'multi_needle4_en', + 'multi_needle5_en', + 'multi_needle2_zh', + 'multi_needle3_zh', + 'multi_needle4_zh', + 'multi_needle5_zh', + + # *_needlebench_200k_origin, *_needlebench_200k_multi_needle, *_needlebench_200k_parallel, + ], + summary_groups=needlebench_summary_groups, +) +context_lengths_8k = list(range(5000, 9000, 1000)) + +# Repeating the same process for parallel (assuming it's similar to origin_en) +_needlebench_8k_parallel_en_batch1 = [] +_needlebench_8k_parallel_en_batch5 = [] +_needlebench_8k_parallel_en_batch10 = [] +_needlebench_8k_parallel_en_batch15 = [] +_needlebench_8k_parallel_en_batch20 = [] +_needlebench_8k_parallel_zh_batch1 = [] +_needlebench_8k_parallel_zh_batch5 = [] +_needlebench_8k_parallel_zh_batch10 = [] +_needlebench_8k_parallel_zh_batch15 = [] +_needlebench_8k_parallel_zh_batch20 = [] +for original_context_length in context_lengths_8k: + _needlebench_8k_parallel_en_batch1.append(f'Length{original_context_length}_parallel_en_8k_batch1') + _needlebench_8k_parallel_en_batch5.append(f'Length{original_context_length}_parallel_en_8k_batch5') + _needlebench_8k_parallel_en_batch10.append(f'Length{original_context_length}_parallel_en_8k_batch10') + _needlebench_8k_parallel_en_batch15.append(f'Length{original_context_length}_parallel_en_8k_batch15') + _needlebench_8k_parallel_en_batch20.append(f'Length{original_context_length}_parallel_en_8k_batch20') + _needlebench_8k_parallel_zh_batch1.append(f'Length{original_context_length}_parallel_zh_8k_batch1') + _needlebench_8k_parallel_zh_batch5.append(f'Length{original_context_length}_parallel_zh_8k_batch5') + _needlebench_8k_parallel_zh_batch10.append(f'Length{original_context_length}_parallel_zh_8k_batch10') + _needlebench_8k_parallel_zh_batch15.append(f'Length{original_context_length}_parallel_zh_8k_batch15') + _needlebench_8k_parallel_zh_batch20.append(f'Length{original_context_length}_parallel_zh_8k_batch20') + + +_needlebench_8k_parallel_batch1 = _needlebench_8k_parallel_en_batch1 + _needlebench_8k_parallel_zh_batch1 +_needlebench_8k_parallel_batch5 = _needlebench_8k_parallel_en_batch5 + _needlebench_8k_parallel_zh_batch5 +_needlebench_8k_parallel_batch10 = _needlebench_8k_parallel_en_batch10 + _needlebench_8k_parallel_zh_batch10 +_needlebench_8k_parallel_batch15 = _needlebench_8k_parallel_en_batch15 + _needlebench_8k_parallel_zh_batch15 +_needlebench_8k_parallel_batch20 = _needlebench_8k_parallel_en_batch20 + _needlebench_8k_parallel_zh_batch20 + +needlebench_summary_groups = [ + {'name': 'parallel_version_batch1', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_batch1]}, + {'name': 'parallel_version_zh_batch1', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_zh_batch1]}, + {'name': 'parallel_version_en_batch1', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_en_batch1]}, + {'name': 'parallel_version_batch5', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_batch5]}, + {'name': 'parallel_version_zh_batch5', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_zh_batch5]}, + {'name': 'parallel_version_en_batch5', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_en_batch5]}, + {'name': 'parallel_version_batch10', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_batch10]}, + {'name': 'parallel_version_zh_batch10', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_zh_batch10]}, + {'name': 'parallel_version_en_batch10', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_en_batch10]}, + {'name': 'parallel_version_batch15', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_batch15]}, + {'name': 'parallel_version_zh_batch15', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_zh_batch15]}, + {'name': 'parallel_version_en_batch15', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_en_batch15]}, + {'name': 'parallel_version_batch20', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_batch20]}, + {'name': 'parallel_version_zh_batch20', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_zh_batch20]}, + {'name': 'parallel_version_en_batch20', 'subsets': [[_dataset, "average_score"] for _dataset in _needlebench_8k_parallel_en_batch20]}, +] + +needlebench_8k_batch_overall_summarizer = dict( + dataset_abbrs=[ + '--------- NeedleBench-8k Parallel-Needles ---------', # category + 'parallel_version_batch1', + 'parallel_version_batch5', + 'parallel_version_batch10', + 'parallel_version_batch15', + 'parallel_version_batch20', + 'parallel_version_zh_batch1', + 'parallel_version_en_batch1', + 'parallel_version_zh_batch5', + 'parallel_version_en_batch5', + 'parallel_version_zh_batch10', + 'parallel_version_en_batch10', + 'parallel_version_zh_batch15', + 'parallel_version_en_batch15', + 'parallel_version_zh_batch20', + 'parallel_version_en_batch20', + # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel, + ], + summary_groups=needlebench_summary_groups, +) + +needlebench_summary_groups = [ + {'name': 'parallel_version_batch1', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_batch1]}, + {'name': 'parallel_version_zh_batch1', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_zh_batch1]}, + {'name': 'parallel_version_en_batch1', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_en_batch1]}, + {'name': 'parallel_version_batch5', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_batch5]}, + {'name': 'parallel_version_zh_batch5', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_zh_batch5]}, + {'name': 'parallel_version_en_batch5', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_en_batch5]}, + {'name': 'parallel_version_batch10', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_batch10]}, + {'name': 'parallel_version_zh_batch10', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_zh_batch10]}, + {'name': 'parallel_version_en_batch10', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_en_batch10]}, + {'name': 'parallel_version_batch15', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_batch15]}, + {'name': 'parallel_version_zh_batch15', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_zh_batch15]}, + {'name': 'parallel_version_en_batch15', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_en_batch15]}, + {'name': 'parallel_version_batch20', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_batch20]}, + {'name': 'parallel_version_zh_batch20', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_zh_batch20]}, + {'name': 'parallel_version_en_batch20', 'subsets': [[_dataset, "Depth0"] for _dataset in _needlebench_8k_parallel_en_batch20]}, +] + +needlebench_8k_batch_depth0_summarizer = dict( + dataset_abbrs=[ + '--------- NeedleBench-8k Parallel-Needles ---------', # category + 'parallel_version_batch1', + 'parallel_version_batch5', + 'parallel_version_batch10', + 'parallel_version_batch15', + 'parallel_version_batch20', + 'parallel_version_zh_batch1', + 'parallel_version_en_batch1', + 'parallel_version_zh_batch5', + 'parallel_version_en_batch5', + 'parallel_version_zh_batch10', + 'parallel_version_en_batch10', + 'parallel_version_zh_batch15', + 'parallel_version_en_batch15', + 'parallel_version_zh_batch20', + 'parallel_version_en_batch20', + # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel, + ], + summary_groups=needlebench_summary_groups, +) +needlebench_atc_summarizer = dict( + type=NeedleBenchATCSummarizer, +) diff --git a/opencompass/datasets/needlebench/atc.py b/opencompass/datasets/needlebench/atc.py new file mode 100644 index 00000000..6f3ebe20 --- /dev/null +++ b/opencompass/datasets/needlebench/atc.py @@ -0,0 +1,247 @@ +# flake8: noqa +import json +import random + +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.registry import LOAD_DATASET + + +@LOAD_DATASET.register_module() +class NeedleBenchATCDataset(BaseDataset): + + @staticmethod + def load( + path, + num_needles: int, + language: str, + repeats: int, + ): + data = {'prompt': [], 'answer': []} + + with open(path, 'r', encoding='utf-8') as file: + names_data = json.load(file) + + all_names = names_data[language].split(',') + + for _ in range(repeats): + names = random.sample(all_names, num_needles) + if language == 'Chinese': + + relationship_terms = [ + '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆' + ] + + relationship_templates = [ + '{A}是{B}的{relationship}。', + '{B}的{relationship}是{A}。', + '{A}作为{B}的{relationship},对{B}的成长有重要影响。', + '{A}不仅是{B}的{relationship},还是{B}的榜样。', + '{B}是{A}所生的孩子。', + '{A}对{B}来说,不只是一个{relationship},还是一个朋友。', + '{A}在{B}的生命中扮演着{relationship}的角色。', + '{B}把{A}视为其{relationship}。', + ] + elif language == 'English': + + relationship_terms = [ + 'father', 'mother', 'dad', 'mom', 'grandfather', + 'grandmother', 'maternal grandmother', + 'maternal grandfather', 'paternal grandfather', + 'paternal grandmother' + ] + + relationship_templates = [ + "{A} is {B}'s {relationship}.", + "{B}'s {relationship} is {A}.", + ("{A}, as {B}'s {relationship}, " + "has a significant impact on {B}'s upbringing."), + ("{A} is not only {B}'s {relationship} " + "but also {B}'s role model."), + '{B} is the child of {A}.', + ('For {B}, {A} is not just a {relationship}, ' + 'but also a friend.'), + ("{A} plays the role of {B}'s {relationship} " + "in {B}'s life."), + '{B} considers {A} as their {relationship}.', + ] + + def generate_chain_family_story(names, templates, + relationship_terms): + story = '' + for i in range(len(names) - 1): + template = random.choice(templates) + relation_term = random.choice(relationship_terms) + relation = template.format(A=names[i], + B=names[i + 1], + relationship=relation_term) + story += f'{relation}*' + return story + + chain_story = generate_chain_family_story(names, + relationship_templates, + relationship_terms) + + # Splitting the chain_story into a list of fragments + family_story_fragments = chain_story.split('*') + + # Shuffling the list of fragments + random.shuffle(family_story_fragments) + + # Joining the shuffled fragments back into a string + shuffled_story = ''.join(family_story_fragments) + + last_person = names[-1] + + # Generating the prompt based on the language + if language == 'Chinese': + prompt = (f""" +在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁? +例如: +例子1.如果张强的父亲是马克,此外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。 +例子2.如果李明的姥姥是张红,而张红的父亲是张强,此外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。 +例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,此外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。 +""") + elif language == 'English': + prompt = (f""" +Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context? +For example: +Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark. +Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang. +Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang.""" + ) + else: + prompt = 'Language not supported.' + raise Exception('Unsupported language specified. ' + "Please choose either 'Chinese' or 'English'.") + + # Combine story and prompt + shuffled_story_with_prompt = shuffled_story + ' ' + prompt + + data['prompt'].append(shuffled_story_with_prompt) + data['answer'].append(names[0] + '*' + names[0]) + + dataset = Dataset.from_dict({ + 'prompt': data['prompt'], + 'answer': data['answer'], + }) + return dataset + + +@LOAD_DATASET.register_module() +class NeedleBenchATCOrderedDataset(BaseDataset): + + @staticmethod + def load( + path, + num_needles: int, + language: str, + repeats: int, + ): + data = {'prompt': [], 'answer': []} + + with open(path, 'r', encoding='utf-8') as file: + names_data = json.load(file) + + all_names = names_data[language].split(',') + + for _ in range(repeats): + names = random.sample(all_names, num_needles) + if language == 'Chinese': + + relationship_terms = [ + '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆' + ] + + relationship_templates = [ + '{A}是{B}的{relationship}。', + '{B}的{relationship}是{A}。', + '{A}作为{B}的{relationship},对{B}的成长有重要影响。', + '{A}不仅是{B}的{relationship},还是{B}的榜样。', + '{B}是{A}所生的孩子。', + '{A}对{B}来说,不只是一个{relationship},还是一个朋友。', + '{A}在{B}的生命中扮演着{relationship}的角色。', + '{B}把{A}视为其{relationship}。', + ] + elif language == 'English': + + relationship_terms = [ + 'father', 'mother', 'dad', 'mom', 'grandfather', + 'grandmother', 'maternal grandmother', + 'maternal grandfather', 'paternal grandfather', + 'paternal grandmother' + ] + + relationship_templates = [ + "{A} is {B}'s {relationship}.", + "{B}'s {relationship} is {A}.", + ("{A}, as {B}'s {relationship}, " + "has a significant impact on {B}'s upbringing."), + ("{A} is not only {B}'s {relationship} " + "but also {B}'s role model."), + '{B} is the child of {A}.', + ('For {B}, {A} is not just a {relationship}, ' + 'but also a friend.'), + ("{A} plays the role of {B}'s {relationship} " + "in {B}'s life."), + '{B} considers {A} as their {relationship}.', + ] + + def generate_chain_family_story(names, templates, + relationship_terms): + story = '' + for i in range(len(names) - 1): + template = random.choice(templates) + relation_term = random.choice(relationship_terms) + relation = template.format(A=names[i], + B=names[i + 1], + relationship=relation_term) + story += f'{relation}*' + return story + + chain_story = generate_chain_family_story(names, + relationship_templates, + relationship_terms) + + # Splitting the chain_story into a list of fragments + family_story_fragments = chain_story.split('*') + + # Joining the shuffled fragments back into a string + shuffled_story = ''.join(family_story_fragments) + + last_person = names[-1] + + # Generating the prompt based on the language + if language == 'Chinese': + prompt = (f""" +在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁? +例如: +例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。 +例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。 +例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。 +""") + elif language == 'English': + prompt = (f""" +Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context? +For example: +Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark. +Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang. +Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang.""" + ) + else: + prompt = 'Language not supported.' + raise Exception('Unsupported language specified. ' + "Please choose either 'Chinese' or 'English'.") + + # Combine story and prompt + shuffled_story_with_prompt = shuffled_story + ' ' + prompt + + data['prompt'].append(shuffled_story_with_prompt) + data['answer'].append(names[0] + '*' + names[0]) + + dataset = Dataset.from_dict({ + 'prompt': data['prompt'], + 'answer': data['answer'], + }) + return dataset diff --git a/opencompass/datasets/cdme/cdme_multi.py b/opencompass/datasets/needlebench/multi.py similarity index 73% rename from opencompass/datasets/cdme/cdme_multi.py rename to opencompass/datasets/needlebench/multi.py index 32b29752..3a2d1157 100644 --- a/opencompass/datasets/cdme/cdme_multi.py +++ b/opencompass/datasets/needlebench/multi.py @@ -1,6 +1,6 @@ import json +import os import random -import re from pathlib import Path import tiktoken @@ -8,11 +8,31 @@ from datasets import Dataset from opencompass.datasets.base import BaseDataset from opencompass.openicl import BaseEvaluator -from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.registry import LOAD_DATASET + + +def get_random_needles(file_path, needle_count): + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + matching_records = [ + record for record in data + if record.get('derivation_count') == needle_count + ] + + if matching_records: + random_record = random.choice(matching_records) + return { + 'needles': random_record['derivations'], + 'answer': random_record['answer'], + 'retrieval_question': random_record['question'] + } + else: + return None @LOAD_DATASET.register_module() -class CDMEDataset(BaseDataset): +class NeedleBenchMultiDataset(BaseDataset): @staticmethod def load( @@ -25,11 +45,9 @@ class CDMEDataset(BaseDataset): length_buffer: int, guide: bool, language: str, - needles: 'list[str]', + needle_file_name: str, + num_needles: int, diff: int, - retrieval_question: str, - answer: str, - keyword: str, ): data = {'prompt': [], 'answer': []} tokenizer = tiktoken.encoding_for_model(tokenizer_model) @@ -73,16 +91,14 @@ class CDMEDataset(BaseDataset): def _modify_retrieval_question(retrieval_question): if language == 'Chinese': - parts = retrieval_question.split('请按照') - guide_retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题' - '最相关的内容是什么。请按照' + parts[1]) + guide_retrieval_question = (retrieval_question + + '在回答之前,请思考文档中与此问题' + '最相关的内容是什么。') return guide_retrieval_question elif language == 'English': - parts = retrieval_question.split('Please answer in the format') guide_retrieval_question = ( - parts[0] + 'Before answering, please consider' - ' what in the document is most relevant to this question.' - ' Please answer in the format' + parts[1]) + retrieval_question + 'Before answering, please consider' + ' what in the document is most relevant to this question.') return guide_retrieval_question else: raise ValueError(f"Language '{language}' is not supported.") @@ -112,6 +128,7 @@ class CDMEDataset(BaseDataset): return prompt files = Path(path).glob('*.jsonl') + needle_file_path = os.path.join(path, needle_file_name) for file in files: if file.name not in file_list: continue @@ -122,12 +139,20 @@ class CDMEDataset(BaseDataset): for counter in range(num_repeats_per_file): random.seed(counter) random.shuffle(lines) - + random_needle_data = get_random_needles( + needle_file_path, num_needles) + needles = [ + '\n' + needle + '\n' + for needle in random_needle_data['needles'] + ] + answer = random_needle_data['answer'] + keyword = answer + retrieval_question = random_needle_data['retrieval_question'] context_length = length - length_buffer target_length_per_record = context_length - \ sum(len(tokens) for tokens in _get_tokens_from_context(needles)) - + target_length_per_record = max(target_length_per_record, 0) accumulated_tokens = [] for line in lines: tokens_current_line = _get_tokens_from_context( @@ -154,7 +179,7 @@ class CDMEDataset(BaseDataset): return dataset -class CDMEEvaluator(BaseEvaluator): +class NeedleBenchMultiEvaluator(BaseEvaluator): def levenshtein_distance(self, s1, s2): if len(s1) < len(s2): @@ -175,50 +200,33 @@ class CDMEEvaluator(BaseEvaluator): return previous_row[-1] - def score(self, predictions, references): - if len(predictions) != len(references): - return { - 'error': 'predictions and references have different lengths' - } + def score(self, predictions, gold): + if len(predictions) != len(gold): + return {'error': 'predictions and gold have different lengths'} total_score = 0 details = [] - for prediction, reference in zip(predictions, references): - keyword = reference.split('*')[1] - reference = reference.split('*')[0] - prediction = re.sub(r'\s+', '', prediction) - reference = re.sub(r'\s+', '', reference) - edit_distance = self.levenshtein_distance(prediction, reference) - max_len = max(len(prediction), len(reference)) - score = 100 * (1 - - edit_distance / max_len) if max_len != 0 else 100 - if keyword in prediction: - print(f'{keyword} is in {prediction}') - score = 100 - else: - print(f'{keyword} is not in {prediction}') - score = 0.2 * score + for prediction, reference in zip(predictions, gold): + answer, keyword = reference.split('*') + keywords = keyword.lower().split() + prediction = prediction.lower() + + keyword_score = 100 / len(keywords) if keywords else 0 + + matched_keywords = sum(1 for kword in keywords + if kword in prediction) + score = matched_keywords * keyword_score detail = { 'pred': prediction, 'answer': reference, - 'edit_distance': edit_distance, + 'matched_keywords': matched_keywords, 'score': score } + total_score += score details.append(detail) average_score = total_score / len(predictions) if predictions else 0 - result = {'score': average_score, 'details': details} - return result - - -@TEXT_POSTPROCESSORS.register_module('cdme') -def cdme_postprocess(text: str) -> str: - return text - - -@TEXT_POSTPROCESSORS.register_module('cdme_dataset') -def cdme_dataset_postprocess(text: str) -> str: - return text + return {'score': average_score, 'details': details} diff --git a/opencompass/datasets/cdme/cdme.py b/opencompass/datasets/needlebench/origin.py similarity index 78% rename from opencompass/datasets/cdme/cdme.py rename to opencompass/datasets/needlebench/origin.py index d3e0f3f2..561de1ba 100644 --- a/opencompass/datasets/cdme/cdme.py +++ b/opencompass/datasets/needlebench/origin.py @@ -1,4 +1,5 @@ import json +import os import random import re from pathlib import Path @@ -11,8 +12,26 @@ from opencompass.openicl import BaseEvaluator from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +def get_random_line_by_language(file_path, language): + with open(file_path, 'r', encoding='utf-8') as file: + lines = [ + json.loads(line.strip()) for line in file + if json.loads(line.strip())['language'] == language + ] + + if lines: + random_line = random.choice(lines) + return { + 'needle': random_line['needle'], + 'retrieval_question': random_line['retrieval_question'], + 'keyword': random_line['arg2'] + } + else: + return None + + @LOAD_DATASET.register_module() -class CDMEDataset(BaseDataset): +class NeedleBenchOriginDataset(BaseDataset): @staticmethod def load( @@ -25,8 +44,7 @@ class CDMEDataset(BaseDataset): length_buffer: int, guide: bool, language: str, - needle: str, - retrieval_question: str, + needle_file_name: str, ): data = {'prompt': [], 'answer': []} tokenizer = tiktoken.encoding_for_model(tokenizer_model) @@ -96,11 +114,17 @@ class CDMEDataset(BaseDataset): for counter in range(num_repeats_per_file): random.seed(counter) random.shuffle(lines) + needle_file_path = os.path.join(path, needle_file_name) + random_needle = get_random_line_by_language( + needle_file_path, language) + needle = '\n' + random_needle['needle'] + '\n' + retrieval_question = random_needle['retrieval_question'] + keyword = random_needle['keyword'] context_length = length - length_buffer target_length_per_record = context_length - len( _get_tokens_from_context(needle)) - + target_length_per_record = max(target_length_per_record, 0) accumulated_tokens = [] for line in lines: tokens_current_line = _get_tokens_from_context( @@ -118,7 +142,7 @@ class CDMEDataset(BaseDataset): retrieval_question) data['prompt'].append(processed_prompt) - data['answer'].append(needle) + data['answer'].append(needle + '*' + keyword) dataset = Dataset.from_dict({ 'prompt': data['prompt'], @@ -127,7 +151,7 @@ class CDMEDataset(BaseDataset): return dataset -class CDMEEvaluator(BaseEvaluator): +class NeedleBenchOriginEvaluator(BaseEvaluator): def __init__(self, use_trim=False): self.use_trim = use_trim @@ -174,20 +198,22 @@ class CDMEEvaluator(BaseEvaluator): return previous_row[-1] - def score(self, predictions, references): - if len(predictions) != len(references): - return { - 'error': 'predictions and references have different lengths' - } + def score(self, predictions, gold): + + if len(predictions) != len(gold): + return {'error': 'predictions and gold have different lengths'} total_score = 0 details = [] - for prediction, reference in zip(predictions, references): + for prediction, reference in zip(predictions, gold): + keyword = reference.split('*')[1] + reference = reference.split('*')[0] + raw_prediction = prediction prediction = re.sub(r'\s+', '', prediction) reference = re.sub(r'\s+', '', reference) if self.use_trim: - prediction = CDMEEvaluator._trim_prediction( + prediction = NeedleBenchOriginEvaluator._trim_prediction( prediction, reference) edit_distance = self.levenshtein_distance(prediction, reference) @@ -195,6 +221,13 @@ class CDMEEvaluator(BaseEvaluator): score = 100 * (1 - edit_distance / max_len) if max_len != 0 else 100 + if keyword in raw_prediction: + print(f'{keyword} is in {prediction}') + score = 100 + else: + print(f'{keyword} is not in {prediction}') + score = 0.2 * score + detail = { 'pred': prediction, 'answer': reference, @@ -209,11 +242,11 @@ class CDMEEvaluator(BaseEvaluator): return result -@TEXT_POSTPROCESSORS.register_module('cdme') -def cdme_postprocess(text: str) -> str: +@TEXT_POSTPROCESSORS.register_module('needlebench') +def needlebench_postprocess(text: str) -> str: return text -@TEXT_POSTPROCESSORS.register_module('cdme_dataset') -def cdme_dataset_postprocess(text: str) -> str: +@TEXT_POSTPROCESSORS.register_module('needlebench_dataset') +def needlebench_dataset_postprocess(text: str) -> str: return text diff --git a/opencompass/datasets/needlebench/parallel.py b/opencompass/datasets/needlebench/parallel.py new file mode 100644 index 00000000..6133a3ca --- /dev/null +++ b/opencompass/datasets/needlebench/parallel.py @@ -0,0 +1,285 @@ +import json +import random +from pathlib import Path + +import tiktoken +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET + + +def get_unique_entries(file_path, + n, + language, + unique_arg1=False, + unique_arg2=False, + unique_combination=False): + seen_arg1 = set() + seen_arg2 = set() + seen_combinations = set() + results = [] + + with open(file_path, 'r', encoding='utf-8') as file: + lines = file.readlines() + random.shuffle(lines) + + for line in lines: + try: + entry = json.loads(line.strip()) + except json.JSONDecodeError: + continue + + if entry.get('language') != language: + continue + + key1 = entry.get('arg1', '') if unique_arg1 else '' + key2 = entry.get('arg2', '') if unique_arg2 else '' + combination = (key1, key2) if unique_combination else '' + + if (key1 not in seen_arg1 or not unique_arg1) and \ + (key2 not in seen_arg2 or not unique_arg2) and \ + (combination not in seen_combinations or not unique_combination): + seen_arg1.add(key1) + seen_arg2.add(key2) + seen_combinations.add(combination) + results.append(entry) + + if len(results) == n: + break + + return results + + +@LOAD_DATASET.register_module() +class NeedleBenchParallelDataset(BaseDataset): + + @staticmethod + def load( + path: str, + needle_file_name: str, + length: int, + depths: list[int], + tokenizer_model: str, + file_list: list[str], + num_repeats_per_file: int, + length_buffer: int, + guide: bool, + language: str, + ): + data = {'prompt': [], 'answer': []} + tokenizer = tiktoken.encoding_for_model(tokenizer_model) + + files = Path(path).glob('*.jsonl') + for file in files: + if file.name == needle_file_name: + needle_file_path = file + + predefined_needles_bak = get_unique_entries(needle_file_path, + len(depths), + language, + unique_arg1=True, + unique_arg2=True, + unique_combination=True) + + def _generate_context(tokens_context, depths, needles): + insertion_points = [ + int(len(tokens_context) * (depth / 100)) for depth in depths + ] + + cumulative_inserted_length = 0 + + for i, needle in enumerate(needles): + needle_tokens = _get_tokens_from_context(needle) + current_insertion_point = min( + insertion_points[i] + cumulative_inserted_length, + len(tokens_context)) + + tokens_context = tokens_context[:current_insertion_point] + \ + needle_tokens + tokens_context[current_insertion_point:] + cumulative_inserted_length += len(needle_tokens) + + new_context = _decode_tokens(tokens_context) + return new_context + + def _get_tokens_from_context(context): + if isinstance(context, list): + return [tokenizer.encode(item) for item in context] + else: + return tokenizer.encode(context) + + def _decode_tokens(tokens): + return tokenizer.decode(tokens) + + def _modify_retrieval_question(retrieval_question): + if language == 'Chinese': + parts = retrieval_question.split('请按照') + guide_retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题' + '最相关的内容是什么。请按照' + parts[1]) + return guide_retrieval_question + elif language == 'English': + parts = retrieval_question.split('Please answer in the format') + guide_retrieval_question = ( + parts[0] + 'Before answering, please consider' + ' what in the document is most relevant to this question.' + ' Please answer in the format' + parts[1]) + return guide_retrieval_question + else: + raise ValueError(f"Language '{language}' is not supported.") + + def _generate_prompt(context, retrieval_question): + if guide: + retrieval_question = _modify_retrieval_question( + retrieval_question) + + if language == 'Chinese': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n请先仔细阅读下面的文档再依次回答' + f'最后提出的问题\n用户现在给你的文档是{context}\n\n' + f'现在请问:{retrieval_question}\n') + elif language == 'English': + prompt = ( + 'You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. Do not' + ' talk about irrelevant topics or repeat your ' + 'answers.\n' + f'The document given to you by the user is {context}' + f'\n\nNow, the questions are: {retrieval_question}\n') + else: + raise ValueError(f"Language '{language}' is not supported.") + + return prompt + + files = Path(path).glob('*.jsonl') + for file in files: + if file.name not in file_list: + continue + + with open(file, 'r', encoding='utf-8') as f: + lines_bak = [json.loads(line.strip()) for line in f] + lines = lines_bak.copy() + for counter in range(num_repeats_per_file): + random.seed(counter) + random.shuffle(lines) + predefined_needles = predefined_needles_bak.copy() + random.shuffle(predefined_needles) + + needles = [ + '\n' + item['needle'] + '\n' for item in predefined_needles + ] + keywords = [item['arg2'] for item in predefined_needles] + if language == 'Chinese': + questions = '、'.join([ + item['retrieval_question'].split('?')[0] + '?' + for item in predefined_needles + ]) + + answers_format = '、'.join([ + item['retrieval_question'].split("'")[1].split('。')[0] + for item in predefined_needles + ]) + retrieval_question = questions + "请按照'" + \ + answers_format + "'的格式回答。" + elif language == 'English': + questions = '、'.join([ + item['retrieval_question'].split('?')[0] + '?' + for item in predefined_needles + ]) + + answers_format = '、'.join([ + item['retrieval_question'].split("'")[1].split('.')[0] + for item in predefined_needles + ]) + retrieval_question = questions + \ + "Please answer in the format of '" + \ + answers_format + "'" + + context_length = length - length_buffer + target_length_per_record = context_length - \ + sum(len(tokens) for tokens + in _get_tokens_from_context(needles)) + target_length_per_record = max(target_length_per_record, 0) + accumulated_tokens = [] + for line in lines: + tokens_current_line = _get_tokens_from_context( + line['text']) + accumulated_tokens.extend(tokens_current_line) + + if len(accumulated_tokens) >= target_length_per_record: + break + + processed_text = _generate_context( + accumulated_tokens[:target_length_per_record], depths, + needles) + + processed_prompt = _generate_prompt(processed_text, + retrieval_question) + + data['prompt'].append(processed_prompt) + + data['answer'].append('*'.join(keywords) + '#' + + '*'.join(map(str, depths))) + + dataset = Dataset.from_dict({ + 'prompt': data['prompt'], + 'answer': data['answer'], + }) + return dataset + + +class NeedleBenchParallelEvaluator(BaseEvaluator): + + def levenshtein_distance(self, s1, s2): + if len(s1) < len(s2): + return self.levenshtein_distance(s2, s1) + + if len(s2) == 0: + return len(s1) + + previous_row = range(len(s2) + 1) + for i, c1 in enumerate(s1): + current_row = [i + 1] + for j, c2 in enumerate(s2): + insertions = previous_row[j + 1] + 1 + deletions = current_row[j] + 1 + substitutions = previous_row[j] + (c1 != c2) + current_row.append(min(insertions, deletions, substitutions)) + previous_row = current_row + + return previous_row[-1] + + def score(self, predictions, gold): + if len(predictions) != len(gold): + return {'error': 'predictions and gold have different lengths'} + print('predictions:', predictions) + print('gold:', gold) + + details = [] + depths = [int(i) for i in gold[0].split('#')[1].split('*')] + scores_by_depth = {depth: 0 for depth in depths} + + for prediction, reference in zip(predictions, gold): + print(reference) + keywords = reference.split('#')[0].split('*') + print(keywords) + for keyword, depth in zip(keywords, depths): + print('iterating:', keyword, depth) + if keyword in prediction: + print(f'{keyword} at depth {depth} is in {prediction}') + scores_by_depth[depth] += 100 / (len(predictions)) + + average_score = sum(scores_by_depth.values()) / len(scores_by_depth) + + flattened_scores = { + 'Depth' + str(depth): score + for depth, score in scores_by_depth.items() + } + + result = { + **flattened_scores, 'details': details, + 'average_score': average_score + } + return result diff --git a/opencompass/summarizers/needlebench.py b/opencompass/summarizers/needlebench.py new file mode 100644 index 00000000..cf4a897d --- /dev/null +++ b/opencompass/summarizers/needlebench.py @@ -0,0 +1,1090 @@ +# flake8: noqa +# yapf: disable +import argparse +import functools +import getpass +import math +import os +import os.path as osp +from datetime import datetime +from typing import Any, Dict, List, Optional + +import matplotlib.pyplot as plt +import mmengine +import numpy as np +import pandas as pd +import seaborn as sns +import tabulate +from matplotlib.colors import LinearSegmentedColormap +from mmengine import ConfigDict +from tqdm import tqdm + +from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg, + get_infer_output_path, get_logger, + model_abbr_from_cfg) +from opencompass.utils.prompt import get_prompt_hash + +METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth'] +METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len'] + +def model_abbr_from_cfg_used_in_summarizer(model): + if model.get('summarizer_abbr', None): + return model['summarizer_abbr'] + else: + return model_abbr_from_cfg(model) + +def read_after_specific_line_except_last(file_name, keyword, offset): + with open(file_name, 'r', encoding='utf-8') as file: + lines = file.readlines() + + # 找到含有关键词的行的索引 + for index, line in enumerate(lines): + if keyword in line: + start_index = index + offset + 1 + break + else: + # 如果文件中没有找到关键字,则返回空字符串 + return '' + + # 返回从指定行开始到倒数第二行的内容 + return ''.join(lines[start_index:-1]) + +def create_model_dataframe(nested_dict, model_name, dataset_abbr, parallel=False): + # 确保模型名存在于字典中 + if model_name not in nested_dict: + print(f'Model {model_name} not found in the provided data.') + return pd.DataFrame() # 返回一个空的DataFrame + + model_data = nested_dict[model_name] + data = [] + + for key, value in model_data.items(): + if parallel: + if dataset_abbr in key: + new_key_base = key.replace(dataset_abbr, '').strip('_') + for depth_key, score in value.items(): + new_key = f'{new_key_base}{depth_key}' + if 'average_score' not in new_key: + data.append([new_key, score]) + else: + if dataset_abbr in key: + score = value.get('score', None) + new_key = key.replace(dataset_abbr, '').strip('_') + data.append([new_key, score]) + + df = pd.DataFrame(data, columns=['dataset', model_name]) + return df + +def parse_model_scores(text): + # 分割字符串为多行 + lines = text.split('\n') + + result_dict = {} + current_model = None + + for line in lines: + # 检查行是否定义了新模型 + if line.startswith('Model:'): + # 获取模型名称 + current_model = line.split('Model:')[1].strip() + result_dict[current_model] = {} + elif current_model and ':' in line: + # 解析数据集和分数 + dataset, score_str = line.split(':', 1) + score_dict = eval(score_str.strip()) + result_dict[current_model][dataset] = score_dict + + return result_dict + +def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str): + df = df_raw.copy() + df['Context Length'] = df['dataset'].apply( + lambda x: int(x.split('Length')[1].split('Depth')[0])) + df['Document Depth'] = df['dataset'].apply( + lambda x: float(x.split('Depth')[1].split('_')[0])) + + model_columns = [ + col for col in df.columns + if col not in ['Context Length', 'Document Depth'] + ] + + for model_name in model_columns[1:]: + model_df = df[['Document Depth', 'Context Length', + model_name]].copy() + model_df.rename(columns={model_name: 'Score'}, inplace=True) + + # Create pivot table + pivot_table = pd.pivot_table(model_df, + values='Score', + index=['Document Depth'], + columns=['Context Length'], + aggfunc='mean') + + # Calculate mean scores + mean_scores = pivot_table.mean().values + + # Calculate overall score + overall_score = mean_scores.mean() + + # Create heatmap and line plot + plt.figure(figsize=(15.5, 8)) + ax = plt.gca() + cmap = LinearSegmentedColormap.from_list( + 'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F']) + + # Draw heatmap + sns.heatmap(pivot_table, + cmap=cmap, + ax=ax, + cbar_kws={'label': 'Score'}, + vmin=0, + vmax=100) + + # Set line plot data + x_data = [i + 0.5 for i in range(len(mean_scores))] + y_data = mean_scores + + # Create twin axis for line plot + ax2 = ax.twinx() + # Draw line plot + ax2.plot(x_data, + y_data, + color='white', + marker='o', + linestyle='-', + linewidth=2, + markersize=8, + label='Average Depth Score') + # Set y-axis range + ax2.set_ylim(0, 100) + + # Hide original y-axis ticks and labels + ax2.set_yticklabels([]) + ax2.set_yticks([]) + + # Add legend + ax2.legend(loc='upper left') + + # Set chart title and labels + ax.set_title(f'{model_name} {dataset_type} Context ' + 'Performance\nFact Retrieval Across ' + 'Context Lengths ("Needle In A Haystack")') + ax.set_xlabel('Token Limit') + ax.set_ylabel('Depth Percent') + ax.set_xticklabels(pivot_table.columns.values, rotation=45) + ax.set_yticklabels(pivot_table.index.values, rotation=0) + # Add overall score as a subtitle + plt.text(0.5, + -0.13, f'Overall Score for {model_name}: ' + f'{overall_score:.2f}', + ha='center', + va='center', + transform=ax.transAxes, + fontsize=13) + + plt.tight_layout() + plt.subplots_adjust(right=1) + plt.draw() + plt.savefig(save_path) + print(f'Saved :{save_path}') + plt.close() # Close figure to prevent memory leaks + return overall_score + +def save_results_to_plots(txt_results_save_path): + + content = read_after_specific_line_except_last(txt_results_save_path, 'raw format', 2) + + parsed_data = parse_model_scores(content) + model_names = get_dict_model_names(parsed_data) + # 定义数字、语言代码和尺寸 + numbers = [2, 3, 4, 5] + languages = ['en', 'zh'] + size_exists = [] + sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k'] + + for size in sizes_origin: + if size in content: + size_exists.append(size) + + # 创建dataset_abbrs列表 + multi_dataset_abbrs = [f'{num}needle_{lang}{size}' for num in numbers for lang in languages for size in size_exists] + origin_dataset_abbrs = [f'origin_{lang}{size}' for lang in languages for size in size_exists] + # 创建parallel_dataset_abbrs列表 + parallel_dataset_abbrs = [f'parallel_{lang}{size}' for lang in languages for size in size_exists] + + dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \ + parallel_dataset_abbrs + base_path = os.path.dirname(txt_results_save_path) + plot_path = os.path.join(base_path, 'plots') + model_scores = {} + for model_name in tqdm(model_names): + model_datasets_scores = {} # Dictionary to store scores for each dataset for the current model + for dataset_abbr in dataset_abbrs: + parallel_flag = 'parallel' in dataset_abbr + + # Create a directory for each dataset_abbr + folder_path = os.path.join(plot_path, dataset_abbr) + ensure_directory(folder_path) + + # Construct the full path to save the image + save_path = os.path.join(folder_path, f'{model_name}.png') + + # Create DataFrame for the model and dataset + df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag) + + # Generate visualization and get the score + score = visualize(df, save_path, model_name, dataset_abbr) + + # Store the score in the dictionary + model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score) + + # Process and visualize the overall score + overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png') + merged_df = merge_dataframes(model_name, dataset_abbrs, parsed_data) + averaged_df = calculate_elementwise_average(merged_df) + + # Assume visualize returns the average score for the overall visualization + overall_score = visualize(averaged_df, overall_score_pic_path, 'weighted_average_score', 'Overall Score') + + # Add the overall score to the dictionary + model_datasets_scores['Overall'] = '{:.02f}'.format(overall_score) + + # Add the model's scores to the main dictionary + model_scores[model_name] = model_datasets_scores + +def ensure_directory(path): + if not os.path.exists(path): + os.makedirs(path) + +def get_dict_model_names(nested_dict): + model_names = [] + for first_level_key in nested_dict: + model_names.append(first_level_key) + return model_names + +def merge_dataframes(model_name, dataset_abbrs, parsed_data): + dfs = [] + for dataset_abbr in dataset_abbrs: + parallel_flag = 'parallel' in dataset_abbr + df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag) + + # 检查DataFrame是否为空或是否有多于一列(除了'dataset'列) + if not df.empty and len(df.columns) > 1: + # 将模型名称列重命名为dataset_abbr + score_column = df.columns[-1] # 假设分数列是最后一列 + df.rename(columns={score_column: dataset_abbr}, inplace=True) + + dfs.append(df) + + # 沿着列方向合并DataFrame + merged_df = pd.concat(dfs, axis=1) + return merged_df + +def calculate_elementwise_average(merged_df): + # 选择需要计算平均值的列 + score_columns = [col for col in merged_df.columns if col != 'dataset'] + + origin_columns = [col for col in score_columns if 'origin' in col] + parallel_columns = [col for col in score_columns if 'parallel' in col] + multi_columns = [col for col in score_columns if 'needle' in col] + + # 计算加权平均分数 + if origin_columns and parallel_columns and multi_columns: + origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4 + parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3 + multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3 + + merged_df['weighted_average_score'] = origin_avg + parallel_avg + multi_avg + else: + # 如果没有任何得分列nn + merged_df['weighted_average_score'] = pd.Series([0] * len(merged_df)) + + return merged_df.iloc[:, [0, -1]] + + +class NeedleBenchSummarizer: + """NeedleBench summarizer in OpenCompass. + + Args: + config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime. + dataset_abbrs (list[str], optional): Dataset abbreviations to be listed in the summary. + summary_groups (list): The dataset groups whose results need to be averaged out. For example, mmlu. Each item it a dict with + 'name' (str) and 'subsets' (list of dataset abbrs), and optionally + 'weights' if weighted average is needed. + prompt_db: A deprecated field. + """ + + def __init__(self, config: ConfigDict, dataset_abbrs: Optional[List[str]] = None, summary_groups: List = [], prompt_db = None) -> None: + self.tasks = [] + self.cfg = config + self.logger = get_logger() + self.summary_groups = summary_groups + self.dataset_abbrs = dataset_abbrs + if prompt_db: + self.logger.warning('prompt_db is deprecated and no longer used. ' + 'Please remove it from your config.') + + # Enable lark bot if lark_url is presented + self.lark_reporter = None + if self.cfg.get('lark_bot_url', None): + self.lark_reporter = LarkReporter(self.cfg['lark_bot_url']) + + self.model_cfgs = self.cfg['models'] + self.dataset_cfgs = self.cfg['datasets'] + self.work_dir = self.cfg['work_dir'] + model_abbrs = [] + for model in self.model_cfgs: + model_abbr = model_abbr_from_cfg_used_in_summarizer(model) + if model_abbr in model_abbrs: + continue + model_abbrs.append(model_abbr) + self.model_abbrs = model_abbrs + + def _pick_up_results(self): + """The function reads the numerical results of evaluations from the + output folder based on the configuration file, and ultimately returns + four dictionaries, each containing processed information in different + formats. The contents of the four dictionaries are as follows: + + - raw_results: contains the raw results of each model on each dataset (excluding details). + - parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored. + - dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. The list is ordered according to the METRIC_WHITELIST, + with metrics appearing earlier considered more important. + - dataset_eval_mode: contains the evaluation mode for each dataset. + """ + # raw_results: {model_abbr: {dataset_abbr: result}} + raw_results : Dict[str, Dict[str, Any]] = {} + # parsed_results: {model_abbr: {dataset_abbr: {metric: score}}} + parsed_results : Dict[str, Dict[str, Dict[str, float]]] = {} + # dataset_metrics: {dataset_abbr: [metric]} + dataset_metrics : Dict[str, List[str]] = {} + + for model in self.model_cfgs: + model_abbr = model_abbr_from_cfg_used_in_summarizer(model) + parsed_results.setdefault(model_abbr, {}) + raw_results.setdefault(model_abbr, {}) + for dataset in self.dataset_cfgs: + dataset_abbr = dataset_abbr_from_cfg(dataset) + filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results')) + if not osp.exists(filepath): + continue + result = mmengine.load(filepath) + result.pop('details', None) + raw_results[model_abbr][dataset_abbr] = result + if 'error' in result: + self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}') + continue + _rst, _dm = {}, [] + for metric, score in result.items(): + if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)): + _rst[metric] = score + _dm.append(metric) + else: + continue + if len(_rst) == 0: + self.logger.warning(f'unknown result format: {result}, continue') + continue + _dm = sorted(_dm, key=lambda i: METRIC_WHITELIST.index(i) if i in METRIC_WHITELIST else len(METRIC_WHITELIST)) + + if dataset_abbr in dataset_metrics: + assert tuple(dataset_metrics[dataset_abbr]) == tuple(_dm), \ + f'{dataset_abbr} has different metrics: {dataset_metrics[dataset_abbr]} vs {_dm}' + else: + dataset_metrics[dataset_abbr] = _dm + parsed_results[model_abbr][dataset_abbr] = _rst + + # dataset_eval_mode: {dataset_abbr: eval_mode} + dataset_eval_mode : Dict[str, str] = {} + for dataset in self.dataset_cfgs: + inferencer = dataset.get('infer_cfg', {}).get('inferencer', {}).get('type', '') + inferencer = inferencer if isinstance(inferencer, str) else inferencer.__name__ + dataset_abbr = dataset_abbr_from_cfg(dataset) + if 'GenInferencer' in inferencer: + dataset_eval_mode[dataset_abbr] = 'gen' + elif 'PPLInferencer' in inferencer: + dataset_eval_mode[dataset_abbr] = 'ppl' + elif 'LLInferencer' in inferencer: + dataset_eval_mode[dataset_abbr] = 'll' + else: + dataset_eval_mode[dataset_abbr] = 'unknown' + self.logger.warning(f'unknown inferencer: {inferencer} - {dataset_abbr}') + return raw_results, parsed_results, dataset_metrics, dataset_eval_mode + + def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics, dataset_eval_mode): + """The function calculates the numerical results for each group based + on the configuration in summary_groups, and updates the contents of + each dictionary accordingly.""" + summary_groups = self.summary_groups + for sg in summary_groups: + for model_abbr in self.model_abbrs: + available_metrics, missing_metrics = [], [] + for i in sg['subsets']: + if isinstance(i, (list, tuple)): + if i[0] in parsed_results[model_abbr] and i[1] in parsed_results[model_abbr][i[0]]: + available_metrics.append(i) + else: + missing_metrics.append(i) + else: + if i in parsed_results[model_abbr]: + available_metrics.append(i) + else: + missing_metrics.append(i) + + if len(available_metrics) == 0: + continue + if len(missing_metrics) != 0: + raw_results[model_abbr][sg['name']] = {'error': 'missing metrics: {}'.format(missing_metrics)} + continue + + if 'metric' in sg: + default_metric = sg['metric'] + need_smart_metric = False + else: + need_smart_metric = True + if sg.get('std', False): + default_metric = 'standard_deviation' + elif sg.get('weights', []): + default_metric = 'weighted_average' + else: + default_metric = 'naive_average' + + scores, eval_modes, group_metrics = {}, [], None + if any(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']) and \ + any(isinstance(dataset_abbr, str) for dataset_abbr in sg['subsets']): + raise NotImplementedError('mixed dataset_abbr type is not supported') + + if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']): + group_metrics = [default_metric] + for dataset_abbr, metric in sg['subsets']: + scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] + eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) + else: + group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']])) + if need_smart_metric and len(group_metrics) > 1: + for metric in group_metrics: + for dataset_abbr in sg['subsets']: + scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] + eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown')) + else: + group_metrics = [default_metric] + for dataset_abbr in sg['subsets']: + metric = dataset_metrics[dataset_abbr][0] + scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] + eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) + + result = {} + for metric in scores: + if default_metric == 'standard_deviation': + avg = sum(scores[metric].values()) / len(scores[metric]) + variance = sum((scores[metric][k] - avg) ** 2 for k in scores[metric]) / len(scores[metric]) + scores[metric] = result[metric] = math.sqrt(variance) + else: + if sg.get('weights', []): + # check sg['weights'][k] != 0 in case of scores[metric][k] is NaN + try: + numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0) + except KeyError: + tmp_scores = {metric: {k.split('@')[0]: v for k, v in scores[metric].items()} for metric in scores} + numerator = sum(tmp_scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0) + denominator = sum(sg['weights'].values()) + else: + numerator = sum(scores[metric].values()) + denominator = len(scores[metric]) + scores[metric] = result[metric] = numerator / denominator + eval_modes = list(set(eval_modes)) + eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed' + + # add to global results + raw_results[model_abbr].setdefault(sg['name'], {}).update(scores) + parsed_results[model_abbr].setdefault(sg['name'], {}).update(result) + dataset_metrics.setdefault(sg['name'], []).extend(group_metrics) + dataset_eval_mode[sg['name']] = eval_mode + + return raw_results, parsed_results, dataset_metrics, dataset_eval_mode + + def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode): + dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs] + prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs} + + summarizer_dataset_abbrs = [] + if self.dataset_abbrs is None: + # display all dataset metrics included in the config + for dataset_abbr in dataset_abbrs: + if dataset_abbr in dataset_metrics: + for metric in dataset_metrics[dataset_abbr]: + summarizer_dataset_abbrs.append((dataset_abbr, metric)) + else: + summarizer_dataset_abbrs.append((dataset_abbr, None)) + # along with all possible group metrics + for dataset_abbr in dataset_metrics: + for metric in dataset_metrics[dataset_abbr]: + if (dataset_abbr, metric) not in summarizer_dataset_abbrs: + summarizer_dataset_abbrs.append((dataset_abbr, metric)) + else: + # follow the required order + for item in self.dataset_abbrs: + if isinstance(item, str): + summarizer_dataset_abbrs.append((item, None)) + elif isinstance(item, (list, tuple)): + summarizer_dataset_abbrs.append((item[0], item[1])) + + table = [] + header = ['dataset', 'version', 'metric', 'mode'] + self.model_abbrs + table.append(header) + + for key in dataset_metrics: + dataset_metrics[key] = list(set(dataset_metrics[key])) + + for dataset_abbr, metric in summarizer_dataset_abbrs: + if dataset_abbr not in dataset_metrics: + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) + table.append(header) + continue + if len(dataset_metrics[dataset_abbr]) >= 10: + metric = 'average_score' + + if metric is None: + metric = dataset_metrics[dataset_abbr][0] + elif metric in dataset_metrics[dataset_abbr]: + pass + else: + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) + continue + + row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')] + for model_abbr in self.model_abbrs: + if dataset_abbr in parsed_results[model_abbr]: + row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][metric])) + else: + row.append('-') + + table.append(row) + for i in range(len(table)): + if i == 0 or table[i][0].startswith('---------'): + table[i] = [table[i][0]] + table[i][4:] + else: + table[i] = [table[i][0]] + table[i][4:] + + return table + + def _format_raw_txt(self, raw_results): + raw_dataset_abbrs = [] + for model_abbr in self.model_abbrs: + for dataset_abbr in raw_results[model_abbr]: + if dataset_abbr not in raw_dataset_abbrs: + raw_dataset_abbrs.append(dataset_abbr) + raw_txts = [] + for model_abbr in self.model_abbrs: + raw_txts.append('-------------------------------') + raw_txts.append(f'Model: {model_abbr}') + for dataset_abbr in raw_dataset_abbrs: + result = raw_results[model_abbr].get(dataset_abbr, '{}') + raw_txts.append(f'{dataset_abbr}: {result}') + raw_txts = '\n'.join(raw_txts) + return raw_txts + + def _read_and_sort_dataframe(self, file_path): + # Read the file without treating the first row as a header + df = pd.read_csv(file_path, header=None) + + # Function to sort columns based on the value of a specific row, excluding the first column + def sort_columns_based_on_row_corrected(df, base_row_idx, start_row_idx, end_row_idx): + # Extract the rows for sorting + sort_values_row = df.iloc[base_row_idx, 1:].replace('-', np.nan).apply(pd.to_numeric, errors='coerce') + # Handle NaNs by setting them to a value less than the minimum or using a method to keep them at the end + min_possible_value = sort_values_row.min(skipna=True) - 1 # Use min value in the row minus 1 or another method + sort_values_row_filled = sort_values_row.fillna(min_possible_value) + # Get the sorted order of indices, excluding the first column + sorted_col_indices = sort_values_row_filled.sort_values(ascending=False).index + # Apply the sorted column indices to the whole DataFrame, adjusting for Python's 0-based index + df.iloc[start_row_idx:end_row_idx+1] = df.iloc[start_row_idx:end_row_idx+1, [0] + sorted_col_indices.tolist()] + + # Apply the corrected sorting function based on the description + sort_columns_based_on_row_corrected(df, 1, 0, 2) # For rows 1-2 based on row 2's values + sort_columns_based_on_row_corrected(df, 4, 3, 7) # For rows 4-7 based on row 5's values + sort_columns_based_on_row_corrected(df, 9, 8, 12) # For rows 9-12 based on row 10's values + sort_columns_based_on_row_corrected(df, 14, 13, 25) # For rows 14-25 based on row 15's values + + # Return the sorted DataFrame + return df + + def _output_to_file(self, output_path, time_str, table, raw_txts): + # output to file + if output_path is None: + output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt') + output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv') + else: + output_csv_path = output_path.replace('.txt', '.csv') + + output_dir = osp.split(output_path)[0] + mmengine.mkdir_or_exist(output_dir) + with open(output_path, 'w', encoding='utf-8') as f: + text = f'{time_str}\n' + \ + 'tabulate format\n' + \ + '^' * 128 + '\n' + \ + tabulate.tabulate(table, headers='firstrow') + '\n' + \ + '$' * 128 + '\n\n' + \ + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \ + 'csv format\n' + \ + '^' * 128 + '\n' + \ + '\n'.join([','.join(row) for row in table]) + '\n' + \ + '$' * 128 + '\n\n' + \ + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \ + 'raw format\n' + \ + '^' * 128 + '\n' + \ + raw_txts + '\n' + \ + '$' * 128 + '\n' + f.write(text) + self.logger.info(f'write summary to {osp.abspath(output_path)}') + + with open(output_csv_path, 'w', encoding='utf-8') as f: + f.write('\n'.join([','.join(row) for row in table]) + '\n') + self.logger.info(f'write csv to {osp.abspath(output_csv_path)}') + + # 读取、排序并获取DataFrame + df_sorted = self._read_and_sort_dataframe(output_csv_path) + + # 导出排序后的DataFrame为CSV文件 + sorted_file_path = osp.abspath(output_csv_path).split('.')[0] + '_sorted.csv' # 指定输出文件的路径 + df_sorted.to_csv(sorted_file_path, index=False, header=False) + + self.logger.info(f'write sorted csv to {sorted_file_path}') + + + def summarize( + self, + output_path: str = None, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa + + # pick up results + raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results() + + # calculate group metrics + raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \ + self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode) + + # format table + table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode) + + # format raw txt + raw_txts = self._format_raw_txt(raw_results) + + # output to screen + print(tabulate.tabulate(table, headers='firstrow')) + + # output to .text / .csv files + self._output_to_file(output_path, time_str, table, raw_txts) + + if self.lark_reporter: + content = f'{getpass.getuser()} 的' + content += f'详细评测汇总已输出至 {osp.abspath(output_path)}' + self.lark_reporter.post(content) + + if output_path is None: + output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt') + # plot to show visualize results + save_results_to_plots(output_path) + + +class NeedleBenchATCSummarizer: + """NeedleBench summarizer in OpenCompass. + + Args: + config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime. + dataset_abbrs (list[str], optional): Dataset abbreviations to be listed in the summary. + summary_groups (list): The dataset groups whose results need to be averaged out. For example, mmlu. Each item it a dict with + 'name' (str) and 'subsets' (list of dataset abbrs), and optionally + 'weights' if weighted average is needed. + prompt_db: A deprecated field. + """ + + def __init__(self, config: ConfigDict, dataset_abbrs: Optional[List[str]] = None, summary_groups: List = [], prompt_db = None) -> None: + self.tasks = [] + self.cfg = config + self.logger = get_logger() + self.summary_groups = summary_groups + self.dataset_abbrs = dataset_abbrs + if prompt_db: + self.logger.warning('prompt_db is deprecated and no longer used. ' + 'Please remove it from your config.') + + # Enable lark bot if lark_url is presented + self.lark_reporter = None + if self.cfg.get('lark_bot_url', None): + self.lark_reporter = LarkReporter(self.cfg['lark_bot_url']) + + self.model_cfgs = self.cfg['models'] + self.dataset_cfgs = self.cfg['datasets'] + self.work_dir = self.cfg['work_dir'] + model_abbrs = [] + for model in self.model_cfgs: + model_abbr = model_abbr_from_cfg_used_in_summarizer(model) + if model_abbr in model_abbrs: + continue + model_abbrs.append(model_abbr) + self.model_abbrs = model_abbrs + + def _pick_up_results(self): + """The function reads the numerical results of evaluations from the + output folder based on the configuration file, and ultimately returns + four dictionaries, each containing processed information in different + formats. The contents of the four dictionaries are as follows: + + - raw_results: contains the raw results of each model on each dataset (excluding details). + - parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored. + - dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. The list is ordered according to the METRIC_WHITELIST, + with metrics appearing earlier considered more important. + - dataset_eval_mode: contains the evaluation mode for each dataset. + """ + # raw_results: {model_abbr: {dataset_abbr: result}} + raw_results : Dict[str, Dict[str, Any]] = {} + # parsed_results: {model_abbr: {dataset_abbr: {metric: score}}} + parsed_results : Dict[str, Dict[str, Dict[str, float]]] = {} + # dataset_metrics: {dataset_abbr: [metric]} + dataset_metrics : Dict[str, List[str]] = {} + + for model in self.model_cfgs: + model_abbr = model_abbr_from_cfg_used_in_summarizer(model) + parsed_results.setdefault(model_abbr, {}) + raw_results.setdefault(model_abbr, {}) + for dataset in self.dataset_cfgs: + dataset_abbr = dataset_abbr_from_cfg(dataset) + filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results')) + if not osp.exists(filepath): + continue + result = mmengine.load(filepath) + result.pop('details', None) + raw_results[model_abbr][dataset_abbr] = result + if 'error' in result: + self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}') + continue + _rst, _dm = {}, [] + for metric, score in result.items(): + if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)): + _rst[metric] = score + _dm.append(metric) + else: + continue + if len(_rst) == 0: + self.logger.warning(f'unknown result format: {result}, continue') + continue + _dm = sorted(_dm, key=lambda i: METRIC_WHITELIST.index(i) if i in METRIC_WHITELIST else len(METRIC_WHITELIST)) + + if dataset_abbr in dataset_metrics: + assert tuple(dataset_metrics[dataset_abbr]) == tuple(_dm), \ + f'{dataset_abbr} has different metrics: {dataset_metrics[dataset_abbr]} vs {_dm}' + else: + dataset_metrics[dataset_abbr] = _dm + parsed_results[model_abbr][dataset_abbr] = _rst + + # dataset_eval_mode: {dataset_abbr: eval_mode} + dataset_eval_mode : Dict[str, str] = {} + for dataset in self.dataset_cfgs: + inferencer = dataset.get('infer_cfg', {}).get('inferencer', {}).get('type', '') + inferencer = inferencer if isinstance(inferencer, str) else inferencer.__name__ + dataset_abbr = dataset_abbr_from_cfg(dataset) + if 'GenInferencer' in inferencer: + dataset_eval_mode[dataset_abbr] = 'gen' + elif 'PPLInferencer' in inferencer: + dataset_eval_mode[dataset_abbr] = 'ppl' + elif 'LLInferencer' in inferencer: + dataset_eval_mode[dataset_abbr] = 'll' + else: + dataset_eval_mode[dataset_abbr] = 'unknown' + self.logger.warning(f'unknown inferencer: {inferencer} - {dataset_abbr}') + return raw_results, parsed_results, dataset_metrics, dataset_eval_mode + + def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics, dataset_eval_mode): + """The function calculates the numerical results for each group based + on the configuration in summary_groups, and updates the contents of + each dictionary accordingly.""" + summary_groups = self.summary_groups + for sg in summary_groups: + for model_abbr in self.model_abbrs: + available_metrics, missing_metrics = [], [] + for i in sg['subsets']: + if isinstance(i, (list, tuple)): + if i[0] in parsed_results[model_abbr] and i[1] in parsed_results[model_abbr][i[0]]: + available_metrics.append(i) + else: + missing_metrics.append(i) + else: + if i in parsed_results[model_abbr]: + available_metrics.append(i) + else: + missing_metrics.append(i) + + if len(available_metrics) == 0: + continue + if len(missing_metrics) != 0: + raw_results[model_abbr][sg['name']] = {'error': 'missing metrics: {}'.format(missing_metrics)} + continue + + if 'metric' in sg: + default_metric = sg['metric'] + need_smart_metric = False + else: + need_smart_metric = True + if sg.get('std', False): + default_metric = 'standard_deviation' + elif sg.get('weights', []): + default_metric = 'weighted_average' + else: + default_metric = 'naive_average' + + scores, eval_modes, group_metrics = {}, [], None + if any(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']) and \ + any(isinstance(dataset_abbr, str) for dataset_abbr in sg['subsets']): + raise NotImplementedError('mixed dataset_abbr type is not supported') + + if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']): + group_metrics = [default_metric] + for dataset_abbr, metric in sg['subsets']: + scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] + eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) + else: + group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']])) + if need_smart_metric and len(group_metrics) > 1: + for metric in group_metrics: + for dataset_abbr in sg['subsets']: + scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] + eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown')) + else: + group_metrics = [default_metric] + for dataset_abbr in sg['subsets']: + metric = dataset_metrics[dataset_abbr][0] + scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] + eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) + + result = {} + for metric in scores: + if default_metric == 'standard_deviation': + avg = sum(scores[metric].values()) / len(scores[metric]) + variance = sum((scores[metric][k] - avg) ** 2 for k in scores[metric]) / len(scores[metric]) + scores[metric] = result[metric] = math.sqrt(variance) + else: + if sg.get('weights', []): + # check sg['weights'][k] != 0 in case of scores[metric][k] is NaN + try: + numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0) + except KeyError: + tmp_scores = {metric: {k.split('@')[0]: v for k, v in scores[metric].items()} for metric in scores} + numerator = sum(tmp_scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0) + denominator = sum(sg['weights'].values()) + else: + numerator = sum(scores[metric].values()) + denominator = len(scores[metric]) + scores[metric] = result[metric] = numerator / denominator + eval_modes = list(set(eval_modes)) + eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed' + + # add to global results + raw_results[model_abbr].setdefault(sg['name'], {}).update(scores) + parsed_results[model_abbr].setdefault(sg['name'], {}).update(result) + dataset_metrics.setdefault(sg['name'], []).extend(group_metrics) + dataset_eval_mode[sg['name']] = eval_mode + + return raw_results, parsed_results, dataset_metrics, dataset_eval_mode + + def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode): + dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs] + prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs} + + summarizer_dataset_abbrs = [] + if self.dataset_abbrs is None: + # display all dataset metrics included in the config + for dataset_abbr in dataset_abbrs: + if dataset_abbr in dataset_metrics: + for metric in dataset_metrics[dataset_abbr]: + summarizer_dataset_abbrs.append((dataset_abbr, metric)) + else: + summarizer_dataset_abbrs.append((dataset_abbr, None)) + # along with all possible group metrics + for dataset_abbr in dataset_metrics: + for metric in dataset_metrics[dataset_abbr]: + if (dataset_abbr, metric) not in summarizer_dataset_abbrs: + summarizer_dataset_abbrs.append((dataset_abbr, metric)) + else: + # follow the required order + for item in self.dataset_abbrs: + if isinstance(item, str): + summarizer_dataset_abbrs.append((item, None)) + elif isinstance(item, (list, tuple)): + summarizer_dataset_abbrs.append((item[0], item[1])) + + table = [] + header = ['dataset', 'version', 'metric', 'mode'] + self.model_abbrs + table.append(header) + + for key in dataset_metrics: + dataset_metrics[key] = list(set(dataset_metrics[key])) + + for dataset_abbr, metric in summarizer_dataset_abbrs: + if dataset_abbr not in dataset_metrics: + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) + table.append(header) + continue + if len(dataset_metrics[dataset_abbr]) >= 10: + metric = 'average_score' + + if metric is None: + metric = dataset_metrics[dataset_abbr][0] + elif metric in dataset_metrics[dataset_abbr]: + pass + else: + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) + continue + + row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')] + for model_abbr in self.model_abbrs: + if dataset_abbr in parsed_results[model_abbr]: + row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][metric])) + else: + row.append('-') + + table.append(row) + for i in range(len(table)): + if i == 0 or table[i][0].startswith('---------'): + table[i] = [table[i][0]] + table[i][4:] + else: + table[i] = [table[i][0]] + table[i][4:] + + return table + + def _format_raw_txt(self, raw_results): + raw_dataset_abbrs = [] + for model_abbr in self.model_abbrs: + for dataset_abbr in raw_results[model_abbr]: + if dataset_abbr not in raw_dataset_abbrs: + raw_dataset_abbrs.append(dataset_abbr) + raw_txts = [] + for model_abbr in self.model_abbrs: + raw_txts.append('-------------------------------') + raw_txts.append(f'Model: {model_abbr}') + for dataset_abbr in raw_dataset_abbrs: + result = raw_results[model_abbr].get(dataset_abbr, '{}') + raw_txts.append(f'{dataset_abbr}: {result}') + raw_txts = '\n'.join(raw_txts) + return raw_txts + + def _read_and_sort_dataframe(self, file_path): + # Read the file without treating the first row as a header + data = pd.read_csv(file_path) + # print(data) + # Correct the extraction of needle counts for all settings + data['needle_count'] = data['dataset'].str.extract(r'needle_(\d+)_').astype(float) + data['needle_count'] = data['needle_count'].astype(int) + + # Define experimental settings groups + experimental_settings = { + 'en': '_en$', + 'zh': '_zh$', + 'en_ordered': '_en_ordered', + 'zh_ordered': '_zh_ordered', + } + + # Function to calculate maximum needles + def calculate_max_needles(dataset): + max_needles = {model: None for model in dataset.columns if 'b' in model} + for model in max_needles.keys(): + consecutive_low_scores = 0 + previous_needle_count = 0 + for index, row in dataset.sort_values(by='needle_count').iterrows(): + try: + score = float(row[model]) + except ValueError as e: + score = -1 + if score < 60: + consecutive_low_scores += 1 + if consecutive_low_scores == 1: + max_needles[model] = previous_needle_count + else: + consecutive_low_scores = 0 + previous_needle_count = row['needle_count'] + max_needle_count_seen = dataset['needle_count'].max() + max_needles[model] = max_needle_count_seen if max_needles[model] is None else max_needles[model] + return max_needles + + # Calculate max needles for each group and organize results in a DataFrame + results = {} + for setting, regex in experimental_settings.items(): + filtered_data = data[data['dataset'].str.contains(regex)] + results[setting] = calculate_max_needles(filtered_data) + + # Convert results to DataFrame and transpose it + results_df = pd.DataFrame(results).transpose() + + # Return the sorted DataFrame + results_df.index.name = 'ATC Experiment Type' + return results_df + + def _output_to_file(self, output_path, time_str, table, raw_txts): + # output to file + if output_path is None: + output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt') + output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv') + else: + output_csv_path = output_path.replace('.txt', '.csv') + + output_dir = osp.split(output_path)[0] + mmengine.mkdir_or_exist(output_dir) + with open(output_path, 'w', encoding='utf-8') as f: + text = f'{time_str}\n' + \ + 'tabulate format\n' + \ + '^' * 128 + '\n' + \ + tabulate.tabulate(table, headers='firstrow') + '\n' + \ + '$' * 128 + '\n\n' + \ + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \ + 'csv format\n' + \ + '^' * 128 + '\n' + \ + '\n'.join([','.join(row) for row in table]) + '\n' + \ + '$' * 128 + '\n\n' + \ + '-' * 128 + ' THIS IS A DIVIDER ' + '-' * 128 + '\n\n' + \ + 'raw format\n' + \ + '^' * 128 + '\n' + \ + raw_txts + '\n' + \ + '$' * 128 + '\n' + f.write(text) + self.logger.info(f'write summary to {osp.abspath(output_path)}') + + with open(output_csv_path, 'w', encoding='utf-8') as f: + f.write('\n'.join([','.join(row) for row in table]) + '\n') + # self.logger.info(f'write csv to {osp.abspath(output_csv_path)}') + + # 读取、排序并获取DataFrame + df_sorted = self._read_and_sort_dataframe(output_csv_path) + + # 导出排序后的DataFrame为CSV文件 + df_sorted.to_csv(output_csv_path) + + self.logger.info(f'write sorted csv to {output_csv_path}') + + + def summarize( + self, + output_path: str = None, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): # noqa + + # pick up results + raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results() + + # calculate group metrics + raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \ + self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode) + + # format table + table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode) + + # format raw txt + raw_txts = self._format_raw_txt(raw_results) + + # output to .text / .csv files + self._output_to_file(output_path, time_str, table, raw_txts) + + if self.lark_reporter: + content = f'{getpass.getuser()} 的' + content += f'详细评测汇总已输出至 {osp.abspath(output_path)}' + self.lark_reporter.post(content) + + if output_path is None: + output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt') diff --git a/tools/tools_needleinahaystack.py b/tools/tools_needleinahaystack.py deleted file mode 100644 index 64a2ba86..00000000 --- a/tools/tools_needleinahaystack.py +++ /dev/null @@ -1,136 +0,0 @@ -import argparse - -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -from matplotlib.colors import LinearSegmentedColormap - - -class CDMEDataset(): - - @staticmethod - def visualize(path: str, dataset_length: str): - for file_path in path: - df = pd.read_csv(file_path) - - df['Context Length'] = df['dataset'].apply( - lambda x: int(x.split('Length')[1].split('Depth')[0])) - df['Document Depth'] = df['dataset'].apply( - lambda x: float(x.split('Depth')[1].split('_')[0])) - - # Exclude 'Context Length' and 'Document Depth' columns - model_columns = [ - col for col in df.columns - if col not in ['Context Length', 'Document Depth'] - ] - - for model_name in model_columns[4:]: - model_df = df[['Document Depth', 'Context Length', - model_name]].copy() - model_df.rename(columns={model_name: 'Score'}, inplace=True) - - # Create pivot table - pivot_table = pd.pivot_table(model_df, - values='Score', - index=['Document Depth'], - columns=['Context Length'], - aggfunc='mean') - - # Calculate mean scores - mean_scores = pivot_table.mean().values - - # Calculate overall score - overall_score = mean_scores.mean() - - # Create heatmap and line plot - plt.figure(figsize=(15.5, 8)) - ax = plt.gca() - cmap = LinearSegmentedColormap.from_list( - 'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F']) - - # Draw heatmap - sns.heatmap(pivot_table, - cmap=cmap, - ax=ax, - cbar_kws={'label': 'Score'}, - vmin=0, - vmax=100) - - # Set line plot data - x_data = [i + 0.5 for i in range(len(mean_scores))] - y_data = mean_scores - - # Create twin axis for line plot - ax2 = ax.twinx() - # Draw line plot - ax2.plot(x_data, - y_data, - color='white', - marker='o', - linestyle='-', - linewidth=2, - markersize=8, - label='Average Depth Score') - # Set y-axis range - ax2.set_ylim(0, 100) - - # Hide original y-axis ticks and labels - ax2.set_yticklabels([]) - ax2.set_yticks([]) - - # Add legend - ax2.legend(loc='upper left') - - # Set chart title and labels - ax.set_title(f'{model_name} {dataset_length} Context ' - 'Performance\nFact Retrieval Across ' - 'Context Lengths ("Needle In A Haystack")') - ax.set_xlabel('Token Limit') - ax.set_ylabel('Depth Percent') - ax.set_xticklabels(pivot_table.columns.values, rotation=45) - ax.set_yticklabels(pivot_table.index.values, rotation=0) - # Add overall score as a subtitle - plt.text(0.5, - -0.13, f'Overall Score for {model_name}: ' - f'{overall_score:.2f}', - ha='center', - va='center', - transform=ax.transAxes, - fontsize=13) - - # Save heatmap as PNG - png_file_path = file_path.replace('.csv', f'_{model_name}.png') - plt.tight_layout() - plt.subplots_adjust(right=1) - plt.draw() - plt.savefig(png_file_path) - plt.show() - - plt.close() # Close figure to prevent memory leaks - - # Print saved PNG file path - print(f'Heatmap for {model_name} saved as: {png_file_path}') - - -def main(): - parser = argparse.ArgumentParser(description='Generate NeedleInAHaystack' - 'Test Plots') - - parser.add_argument('--path', - nargs='*', - default=['path/to/your/result.csv'], - help='Paths to CSV files for visualization') - parser.add_argument('--dataset_length', - default='8K', - type=str, - help='Dataset_length for visualization') - args = parser.parse_args() - - if not args.path: - print("Error: '--path' is required for visualization.") - exit(1) - CDMEDataset.visualize(args.path, args.dataset_length) - - -if __name__ == '__main__': - main()