From 367ba1ba612a8cec5df1f80d5e5ae4e285baf38b Mon Sep 17 00:00:00 2001 From: DseidLi <2568818204@qq.com> Date: Wed, 28 Feb 2024 21:40:00 +0800 Subject: [PATCH] Add NeedleBench-1000K configs --- configs/datasets/needlebench/needlebench.py | 1 + .../needlebench_1000k/needlebench.py | 18 ++ .../needlebench_multi_reasoning.py | 286 ++++++++++++++++++ .../needlebench_multi_retrieval.py | 108 +++++++ .../needlebench_1000k/needlebench_single.py | 109 +++++++ configs/datasets/needlebench/readme.md | 1 + configs/datasets/needlebench/readme_zh-CN.md | 1 + configs/summarizers/needlebench.py | 108 ++++++- 8 files changed, 631 insertions(+), 1 deletion(-) create mode 100644 configs/datasets/needlebench/needlebench_1000k/needlebench.py create mode 100644 configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning.py create mode 100644 configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval.py create mode 100644 configs/datasets/needlebench/needlebench_1000k/needlebench_single.py diff --git a/configs/datasets/needlebench/needlebench.py b/configs/datasets/needlebench/needlebench.py index 291a3dee..09b978dd 100644 --- a/configs/datasets/needlebench/needlebench.py +++ b/configs/datasets/needlebench/needlebench.py @@ -6,5 +6,6 @@ with read_base(): from .needlebench_32k.needlebench import needlebench_datasets as needlebench_datasets_32k from .needlebench_128k.needlebench import needlebench_datasets as needlebench_datasets_128k from .needlebench_200k.needlebench import needlebench_datasets as needlebench_datasets_200k + from .needlebench_1000k.needlebench import needlebench_datasets as needlebench_datasets_1000k needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench.py b/configs/datasets/needlebench/needlebench_1000k/needlebench.py new file mode 100644 index 00000000..b73abb1f --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets + + from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets + from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git 
a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning.py new file mode 100644 index 00000000..80402bdd --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning.py @@ -0,0 +1,286 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] +depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_en = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_en.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': 
base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_en.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_en.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_en.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_zh = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_zh.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 
'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_zh.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_zh.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval.py new file mode 100644 index 00000000..993e4f7c --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval.py @@ -0,0 +1,108 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], 
output_column='answer') +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = 'linear' + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' +depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_1000k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 3000, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_1000k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_single.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_single.py new file mode 100644 index 00000000..5a41275e --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_single.py @@ -0,0 +1,109 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = 
generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] +depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_1000k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_1000k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/readme.md b/configs/datasets/needlebench/readme.md index ea082132..763c1ef5 100644 --- a/configs/datasets/needlebench/readme.md +++ b/configs/datasets/needlebench/readme.md @@ -16,6 +16,7 @@ configs/datasets/needlebench/ ├── needlebench_32k ├── needlebench_128k ├── needlebench_200k +├── needlebench_1000k ├── needlebench.py ├── readme.md └── readme_zh-CN.md diff --git a/configs/datasets/needlebench/readme_zh-CN.md b/configs/datasets/needlebench/readme_zh-CN.md index d8a95cc9..b2716025 100644 --- a/configs/datasets/needlebench/readme_zh-CN.md +++ b/configs/datasets/needlebench/readme_zh-CN.md @@ -16,6 +16,7 @@ configs/datasets/needlebench/ ├── needlebench_32k ├── needlebench_128k ├── needlebench_200k +├── needlebench_1000k ├── needlebench.py ├── readme.md └── readme_zh-CN.md diff --git a/configs/summarizers/needlebench.py b/configs/summarizers/needlebench.py index f31d54c2..27a71702 100644 --- a/configs/summarizers/needlebench.py +++ b/configs/summarizers/needlebench.py @@ -539,8 +539,114 @@ 
needlebench_200k_summarizer = dict( + ], + summary_groups=needlebench_summary_groups, +) -context_lengths_8k = list(range(5000, 9000, 1000)) +# ----------NeedleBench-1000k-summarizer---------- + +context_lengths_1000k = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] +# Initialize the lists +_needlebench_1000k_2needle_en = [] +_needlebench_1000k_3needle_en = [] +_needlebench_1000k_4needle_en = [] +_needlebench_1000k_5needle_en = [] +_needlebench_1000k_2needle_zh = [] +_needlebench_1000k_3needle_zh = [] +_needlebench_1000k_4needle_zh = [] +_needlebench_1000k_5needle_zh = [] +_needlebench_1000k_origin_en = [] +_needlebench_1000k_origin_zh = [] + +# Fill the lists using nested loops +for original_context_length in context_lengths_1000k: + for depth_percent in depths_list_sparse: + _needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k') + _needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k') + _needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k') + _needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k') + _needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k') + _needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k') + _needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k') + _needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k') + + _needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k') + _needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k') + +# Concatenate the multi-needle and origin lists +_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en +_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh +_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh +_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh + +# Parallel version: one dataset per context length (all depths are covered within each dataset) +_needlebench_1000k_parallel_en = [] +_needlebench_1000k_parallel_zh = [] +for original_context_length in context_lengths_1000k: + _needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k') +for original_context_length in context_lengths_1000k: + _needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k') +_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh + +needlebench_summary_groups = [ + {'name': 'original_version', 'subsets': _needlebench_1000k_origin}, + {'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh}, + {'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en}, + + {'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en}, + {'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en}, + {'name': 'multi_needle3_en', 'subsets': 
_needlebench_1000k_3needle_en}, + {'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en}, + {'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en}, + + {'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh}, + {'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh}, + {'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh}, + {'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh}, + {'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh}, + + {'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle}, + + {'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel}, + {'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh}, + {'name': 'parallel_version_en', 'subsets': _needlebench_1000k_parallel_en}, + + {'name': 'overall', + 'subsets': [['original_version', 'naive_average'], + ['multi_needle', 'naive_average'], + ['parallel_version', 'average_score']], + 'weights': {'original_version': 0.4, + 'multi_needle': 0.3, + 'parallel_version': 0.3}}, +] +needlebench_1000k_summarizer = dict( + type=NeedleBenchSummarizer, + dataset_abbrs=[ + 'overall', + '--------- NeedleBench-1000k Single-Needle ---------', # category + 'original_version', + 'original_version_zh', + 'original_version_en', + '--------- NeedleBench-1000k Parallel-Needles ---------', # category + 'parallel_version', + 'parallel_version_zh', + 'parallel_version_en', + '--------- NeedleBench-1000k Multi-Needles ---------', # category + 'multi_needle', + 'multi_needle_en', + 'multi_needle_zh', + 'multi_needle2_en', + 'multi_needle3_en', + 'multi_needle4_en', + 'multi_needle5_en', + 'multi_needle2_zh', + 'multi_needle3_zh', + 'multi_needle4_zh', + 'multi_needle5_zh', + + # *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel, + ], + summary_groups=needlebench_summary_groups, +) + +context_lengths_8k = list(range(5000, 9000, 1000)) # Repeating the same process for parallel (assuming it's similar to origin_en) _needlebench_8k_parallel_en_batch1 = [] _needlebench_8k_parallel_en_batch5 = []
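For reference, below is a minimal sketch of a hypothetical entry-point config showing how the datasets and summarizer added by this patch could be wired together. The file name (something like configs/eval_needlebench_1000k.py) and the empty models list are assumptions for illustration, not part of the patch; the read_base import pattern and the two imported names come from the configs above.

from mmengine.config import read_base

with read_base():
    # Aggregated 1000k dataset list defined in
    # configs/datasets/needlebench/needlebench_1000k/needlebench.py (added by this patch)
    from .datasets.needlebench.needlebench_1000k.needlebench import needlebench_datasets
    # Summarizer added to configs/summarizers/needlebench.py by this patch;
    # aliasing it to `summarizer` inside read_base follows the usual OpenCompass pattern
    from .summarizers.needlebench import needlebench_1000k_summarizer as summarizer

datasets = needlebench_datasets
models = []  # placeholder: supply model configs that can handle ~1M-token contexts

Note that runs at these lengths are expensive: each context length/depth cell builds prompts of up to one million tokens, so the model configs supplied here need both a matching max sequence length and generous inference time budgets.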