[Feature] update needlebench and configs (#986)

* add Needlebench-1000K configs

* add prompt position args

* add model configs

* Update parallel.py

* fix lint
Mo Li 2024-03-25 18:05:01 +08:00 committed by GitHub
parent 0665bb91a8
commit 0a6a03fe1a
14 changed files with 831 additions and 38 deletions


@@ -6,5 +6,6 @@ with read_base():
    from .needlebench_32k.needlebench import needlebench_datasets as needlebench_datasets_32k
    from .needlebench_128k.needlebench import needlebench_datasets as needlebench_datasets_128k
    from .needlebench_200k.needlebench import needlebench_datasets as needlebench_datasets_200k
    from .needlebench_1000k.needlebench import needlebench_datasets as needlebench_datasets_1000k

needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])


@@ -0,0 +1,18 @@
from mmengine.config import read_base

with read_base():
    from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets
    from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets
    from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets
    from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets
    from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets
    from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets
    from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets
    from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets
    from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets
    from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets
    from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets
    from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets

needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
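The trailing sum() line is the aggregation idiom used across these configs: every module-level name ending in _datasets is flattened into one list. A minimal, self-contained sketch of the pattern (toy names, not the real configs):

# Toy illustration of the locals()-based aggregation used above.
foo_datasets = [{'abbr': 'foo'}]
bar_datasets = [{'abbr': 'bar'}]

# sum(..., []) concatenates every list whose name ends in '_datasets'.
all_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in all_datasets])  # ['foo', 'bar']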


@@ -0,0 +1,286 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
    return round(L / (1 + math.exp(-k * (x - x0))), 3)


def generate_linear_space(start, end, num):
    if num == 1:
        return [start]
    elif num < 1:
        raise ValueError("num must be at least 1.")
    step = (end - start) / (num - 1)
    return [start + step * i for i in range(num)]


def generate_depth_percents(intervals, interval_type):
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    elif interval_type == 'sigmoid':
        linear_space = generate_linear_space(0, 100, intervals)
        return [logistic(x) for x in linear_space]
    else:
        raise ValueError('Unsupported interval type')
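For reference, a quick check of what the two interval types produce (hypothetical call, not part of the shipped config, which hardcodes its depth list below):

# Hypothetical usage of the helpers above: linear spacing spreads depths
# evenly, while sigmoid spacing clusters them around mid-document.
print(generate_depth_percents(5, 'linear'))   # [0.0, 25.0, 50.0, 75.0, 100.0]
print(generate_depth_percents(5, 'sigmoid'))  # [0.669, 7.586, 50.0, 92.414, 99.331]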
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchMultiEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')
context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# ----------English Version----------
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
diff = 10
num_needles = 2
needlebench_datasets_2needle_en = []
language = 'English'
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_2needle_en.append(dataset_dict)
num_needles = 3
needlebench_datasets_3needle_en = []
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_3needle_en.append(dataset_dict)
num_needles = 4
needlebench_datasets_4needle_en = []
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_4needle_en.append(dataset_dict)
num_needles = 5
needlebench_datasets_5needle_en = []
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_5needle_en.append(dataset_dict)
# ----------Chinese Version----------
base_path = './data/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
diff = 10
num_needles = 2
needlebench_datasets_2needle_zh = []
language = 'Chinese'
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_2needle_zh.append(dataset_dict)
num_needles = 3
needlebench_datasets_3needle_zh = []
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_3needle_zh.append(dataset_dict)
num_needles = 4
needlebench_datasets_4needle_zh = []
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_4needle_zh.append(dataset_dict)
num_needles = 5
needlebench_datasets_5needle_zh = []
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_5needle_zh.append(dataset_dict)
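Taken together, the eight loops above enumerate a sizeable grid; a rough count based on the lists defined at the top of this file:

# Rough count of the dataset entries defined in this file: 8 context
# lengths x 11 depths per needle count and language, for needle counts
# 2-5 in two languages.
total = len(context_lengths) * len(depths_list) * 4 * 2
print(total)  # 8 * 11 * 4 * 2 = 704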


@@ -0,0 +1,108 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
    return round(L / (1 + math.exp(-k * (x - x0))), 3)


def generate_linear_space(start, end, num):
    if num == 1:
        return [start]
    elif num < 1:
        raise ValueError("num must be at least 1.")
    step = (end - start) / (num - 1)
    return [start + step * i for i in range(num)]


def generate_depth_percents(intervals, interval_type):
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    elif interval_type == 'sigmoid':
        linear_space = generate_linear_space(0, 100, intervals)
        return [logistic(x) for x in linear_space]
    else:
        raise ValueError('Unsupported interval type')


needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchParallelEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')

context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = "linear"
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_datasets_en = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
    dataset_dict = {
        'abbr': f'Length{original_context_length}'
                f'_parallel_en_1000k',
        'type': NeedleBenchParallelDataset,
        'path': base_path,
        'needle_file_name': needle_file_name,
        'length': original_context_length,
        'depths': depths,
        'tokenizer_model': 'gpt-4',
        'file_list': file_list,
        'num_repeats_per_file': 25,
        'length_buffer': 3000,
        'guide': True,
        'language': 'English',
        'reader_cfg': needlebench_reader_cfg,
        'infer_cfg': needlebench_infer_cfg,
        'eval_cfg': needlebench_eval_cfg
    }
    needlebench_datasets_en.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_datasets_zh = []
for original_context_length in context_lengths:
    dataset_dict = {
        'abbr': f'Length{original_context_length}'
                f'_parallel_zh_1000k',
        'type': NeedleBenchParallelDataset,
        'path': base_path,
        'needle_file_name': needle_file_name,
        'length': original_context_length,
        'depths': depths,
        'tokenizer_model': 'gpt-4',
        'file_list': file_list,
        'num_repeats_per_file': 25,
        'length_buffer': 200,
        'guide': True,
        'language': 'Chinese',
        'reader_cfg': needlebench_reader_cfg,
        'infer_cfg': needlebench_infer_cfg,
        'eval_cfg': needlebench_eval_cfg
    }
    needlebench_datasets_zh.append(dataset_dict)
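Note the structural difference from the single-needle config: each parallel entry receives the whole depths list, so the loops above yield one dataset per context length rather than a length-by-depth grid. A small sanity check under that reading:

# Sanity check (illustrative): one parallel dataset per context length.
assert len(needlebench_datasets_en) == len(context_lengths)  # 8
assert len(needlebench_datasets_zh) == len(context_lengths)  # 8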


@@ -0,0 +1,109 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
    return round(L / (1 + math.exp(-k * (x - x0))), 3)


def generate_linear_space(start, end, num):
    if num == 1:
        return [start]
    elif num < 1:
        raise ValueError("num must be at least 1.")
    step = (end - start) / (num - 1)
    return [start + step * i for i in range(num)]


def generate_depth_percents(intervals, interval_type):
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    elif interval_type == 'sigmoid':
        linear_space = generate_linear_space(0, 100, intervals)
        return [logistic(x) for x in linear_space]
    else:
        raise ValueError('Unsupported interval type')


needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchOriginEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')
context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_datasets_en = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_origin_en_1000k',
            'type': NeedleBenchOriginDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': 'English',
            'needle_file_name': needle_file_name,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_en.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_datasets_zh = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
                    f'Depth{int(depth_percent)}_origin_zh_1000k',
            'type': NeedleBenchOriginDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': 'Chinese',
            'needle_file_name': needle_file_name,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_datasets_zh.append(dataset_dict)


@@ -16,6 +16,7 @@ configs/datasets/needlebench/
├── needlebench_32k
├── needlebench_128k
├── needlebench_200k
├── needlebench_1000k
├── needlebench.py
├── readme.md
└── readme_zh-CN.md


@@ -16,6 +16,7 @@ configs/datasets/needlebench/
├── needlebench_32k
├── needlebench_128k
├── needlebench_200k
├── needlebench_1000k
├── needlebench.py
├── readme.md
└── readme_zh-CN.md


@@ -0,0 +1,33 @@
from opencompass.models.turbomind import TurboMindModel

_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
             generate=True),
    ],
    eos_token_id=92542
)

models = [
    dict(
        type=TurboMindModel,
        abbr='internlm2-chat-20b-turbomind',
        path="internlm/internlm2-chat-20b",
        meta_template=_meta_template,
        engine_config=dict(session_len=210000,
                           max_batch_size=8,
                           rope_scaling_factor=3.0,
                           model_name="internlm2-chat-20b",
                           tp=2),
        gen_config=dict(top_k=1, top_p=0.8,
                        temperature=1.0,
                        max_new_tokens=2000),
        max_out_len=2000,
        max_seq_len=210000,
        batch_size=1,
        concurrency=8,
        run_cfg=dict(num_gpus=2, num_procs=1),
    )
]
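A top-level eval config would pair a model entry like this with the new 1000K datasets and summarizer. The sketch below shows one plausible wiring; every import path in it is an assumption, not a file from this commit:

# eval_needlebench_1000k.py: hypothetical glue config (paths are assumed).
from mmengine.config import read_base

with read_base():
    from .datasets.needlebench.needlebench_1000k.needlebench import needlebench_datasets
    from .models.hf_internlm.lmdeploy_internlm2_chat_20b_1000k import models
    from .summarizers.needlebench import needlebench_1000k_summarizer as summarizer

datasets = needlebench_datasets
work_dir = './outputs/needlebench_1000k'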


@@ -0,0 +1,32 @@
from opencompass.models.turbomind import TurboMindModel

_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
             generate=True),
    ],
    eos_token_id=92542
)

models = [
    dict(
        type=TurboMindModel,
        abbr='internlm2-chat-7b-turbomind',
        path="internlm/internlm2-chat-7b",
        meta_template=_meta_template,
        engine_config=dict(session_len=210000,
                           max_batch_size=8,
                           rope_scaling_factor=2.0,
                           model_name="internlm2-chat-7b"),
        gen_config=dict(top_k=1, top_p=0.8,
                        temperature=1.0,
                        max_new_tokens=2000),
        max_out_len=2000,
        max_seq_len=210000,
        batch_size=8,
        concurrency=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]


@@ -0,0 +1,25 @@
from opencompass.models import VLLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>',
             generate=True),
    ],
)

models = [
    dict(
        type=VLLM,
        abbr='qwen-14b-chat-vllm',
        path="Qwen/Qwen-14B-Chat",
        model_kwargs=dict(tensor_parallel_size=4),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='<|im_end|>',
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
]


@@ -539,8 +539,114 @@ needlebench_200k_summarizer = dict(
    ],
    summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-1000k-summarizer----------
context_lengths_1000k = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
# Initialize the lists
_needlebench_1000k_2needle_en = []
_needlebench_1000k_3needle_en = []
_needlebench_1000k_4needle_en = []
_needlebench_1000k_5needle_en = []
_needlebench_1000k_2needle_zh = []
_needlebench_1000k_3needle_zh = []
_needlebench_1000k_4needle_zh = []
_needlebench_1000k_5needle_zh = []
_needlebench_1000k_origin_en = []
_needlebench_1000k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_1000k:
    for depth_percent in depths_list_sparse:
        _needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k')
        _needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k')
        _needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k')
        _needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k')
        _needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k')
        _needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k')
        _needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k')
        _needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k')
        _needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k')
        _needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k')
# Concatenate the multi-needle and origin lists
_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en
_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh
_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh
_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_1000k_parallel_en = []
_needlebench_1000k_parallel_zh = []
for original_context_length in context_lengths_1000k:
    _needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k')
for original_context_length in context_lengths_1000k:
    _needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k')
_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh
needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_1000k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_1000k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_1000k_parallel_en},
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]
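The 'overall' group combines three aggregate scores with the weights given above; a hand-worked sketch of that combination (hypothetical subset scores, the summarizer performs the real aggregation):

# Hypothetical subset scores, weighted as in the 'overall' group above.
scores = {'original_version': 80.0, 'multi_needle': 60.0, 'parallel_version': 70.0}
weights = {'original_version': 0.4, 'multi_needle': 0.3, 'parallel_version': 0.3}
overall = sum(scores[k] * weights[k] for k in weights)
print(overall)  # 0.4*80 + 0.3*60 + 0.3*70 = 71.0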
needlebench_1000k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-1000k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-1000k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-1000k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
context_lengths_8k = list(range(5000, 9000, 1000))
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en_batch1 = []
_needlebench_8k_parallel_en_batch5 = []


@@ -48,6 +48,7 @@ class NeedleBenchMultiDataset(BaseDataset):
        needle_file_name: str,
        num_needles: int,
        diff: int,
        position: str = 'End',
    ):
        data = {'prompt': [], 'answer': []}
        tokenizer = tiktoken.encoding_for_model(tokenizer_model)
@@ -109,19 +110,42 @@
                retrieval_question)
        if language == 'Chinese':
            if position == 'End':
                prompt = ('你是一个善于回答用户问题的智能AI助手\n'
                          '请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
                          ',或重复你的回答\n'
                          f'用户现在给你的文档是{context}\n\n'
                          f'现在请问:{retrieval_question}')
            elif position == 'Start':
                prompt = ('你是一个善于回答用户问题的智能AI助手\n'
                          '请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
                          ',或重复你的回答\n'
                          f'现在请问:{retrieval_question}\n'
                          f'用户现在给你的文档是{context}\n\n')
            else:
                raise ValueError('Unsupported position. '
                                 'Position must be "End" or "Start".')
        elif language == 'English':
            if position == 'End':
                prompt = ('You are an intelligent AI assistant skilled in '
                          'answering user questions.\n'
                          'Please keep your answers concise and clear. Do '
                          'not talk about irrelevant topics or repeat '
                          'your answers.\nThe document '
                          f'given to you by the user is {context}\n\n'
                          f'Now, the question is: {retrieval_question}')
            elif position == 'Start':
                prompt = ('You are an intelligent AI assistant skilled in '
                          'answering user questions.\n'
                          'Please keep your answers concise and clear. Do '
                          'not talk about irrelevant topics or repeat '
                          'your answers.\n'
                          f'Now, the question is: {retrieval_question}\n'
                          'The document given to you by the user'
                          f' is {context}\n\n')
            else:
                raise ValueError('Unsupported position. '
                                 'Position must be "End" or "Start".')
        else:
            raise ValueError(f"Language '{language}' is not supported.")


@@ -45,6 +45,7 @@ class NeedleBenchOriginDataset(BaseDataset):
        guide: bool,
        language: str,
        needle_file_name: str,
        position: str = 'End',
    ):
        data = {'prompt': [], 'answer': []}
        tokenizer = tiktoken.encoding_for_model(tokenizer_model)
@@ -85,19 +86,42 @@
                retrieval_question)
        if language == 'Chinese':
            if position == 'End':
                prompt = ('你是一个善于回答用户问题的智能AI助手\n'
                          '请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
                          ',或重复你的回答\n'
                          f'用户现在给你的文档是{context}\n\n'
                          f'现在请问:{retrieval_question}')
            elif position == 'Start':
                prompt = ('你是一个善于回答用户问题的智能AI助手\n'
                          '请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
                          ',或重复你的回答\n'
                          f'现在请问:{retrieval_question}\n'
                          f'用户现在给你的文档是{context}\n\n')
            else:
                raise ValueError('Unsupported position. '
                                 'Position must be "End" or "Start".')
        elif language == 'English':
            if position == 'End':
                prompt = ('You are an intelligent AI assistant skilled in '
                          'answering user questions.\n'
                          'Please keep your answers concise and clear. Do '
                          'not talk about irrelevant topics or repeat '
                          'your answers.\nThe document '
                          f'given to you by the user is {context}\n\n'
                          f'Now, the question is: {retrieval_question}')
            elif position == 'Start':
                prompt = ('You are an intelligent AI assistant skilled in '
                          'answering user questions.\n'
                          'Please keep your answers concise and clear. Do '
                          'not talk about irrelevant topics or repeat '
                          'your answers.\n'
                          f'Now, the question is: {retrieval_question}\n'
                          'The document given to you by the user'
                          f' is {context}\n\n')
            else:
                raise ValueError('Unsupported position. '
                                 'Position must be "End" or "Start".')
        else:
            raise ValueError(f"Language '{language}' is not supported.")


@@ -67,6 +67,7 @@ class NeedleBenchParallelDataset(BaseDataset):
        length_buffer: int,
        guide: bool,
        language: str,
        position: str = 'End',
    ):
        data = {'prompt': [], 'answer': []}
        tokenizer = tiktoken.encoding_for_model(tokenizer_model)
@@ -134,12 +135,24 @@
                retrieval_question)
        if language == 'Chinese':
            if position == 'End':
                prompt = ('你是一个善于回答用户问题的智能AI助手\n'
                          '请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
                          ',或重复你的回答\n请先仔细阅读下面的文档再依次回答'
                          f'最后提出的问题\n用户现在给你的文档是{context}\n\n'
                          f'现在请问:{retrieval_question}\n')
            elif position == 'Start':
                prompt = ('你是一个善于回答用户问题的智能AI助手\n'
                          '请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
                          ',或重复你的回答\n请先仔细阅读下面的文档再依次回答'
                          f'最后提出的问题\n现在请问:{retrieval_question}\n\n'
                          f'用户现在给你的文档是{context}\n')
            else:
                raise ValueError('Unsupported position. '
                                 'Position must be "End" or "Start".')
        elif language == 'English':
            if position == 'End':
                prompt = (
                    'You are an intelligent AI assistant skilled in '
                    'answering user questions.\n'
@@ -148,6 +161,18 @@ class NeedleBenchParallelDataset(BaseDataset):
                    'answers.\n'
                    f'The document given to you by the user is {context}'
                    f'\n\nNow, the questions are: {retrieval_question}\n')
            elif position == 'Start':
                prompt = (
                    'You are an intelligent AI assistant skilled in '
                    'answering user questions.\n'
                    'Please keep your answers concise and clear. Do not'
                    ' talk about irrelevant topics or repeat your '
                    'answers.\n'
                    f'\nNow, the questions are: {retrieval_question}\n\n'
                    f'The document given to you by the user is {context}')
            else:
                raise ValueError('Unsupported position. '
                                 'Position must be "End" or "Start".')
        else:
            raise ValueError(f"Language '{language}' is not supported.")