Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feature] Needlebench auto-download update (#1480)
* update
* update
* update
Parent: 716d46e1f5
Commit: 6c9cd9a260
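Summary of the change: every NeedleBench config that previously pointed at a local file such as ./data/needlebench/names.json now carries the dataset identifier path = 'opencompass/needlebench' (with a separate file_name in the ATC configs), so the benchmark data can be fetched automatically when it is missing instead of requiring a manual download. The sketch below illustrates the intended resolution contract only; the resolver name and cache layout are illustrative assumptions, not code from this commit.

    import os


    def resolve_needlebench_file(path: str, file_name: str,
                                 cache_root: str = './data') -> str:
        """Hypothetical resolver illustrating the new config contract.

        `path` is a dataset identifier such as 'opencompass/needlebench' and
        `file_name` is a file inside that dataset. A real loader would trigger
        an automatic download into the cache when the dataset is missing; this
        sketch only computes the local location it would resolve to.
        """
        local_dir = os.path.join(cache_root, path.split('/')[-1])
        if not os.path.isdir(local_dir):
            # The auto-download would happen here (e.g. from an upstream mirror).
            pass
        return os.path.join(local_dir, file_name)


    # The new config style used throughout this diff:
    path = 'opencompass/needlebench'
    file_name = 'names.json'
    print(resolve_needlebench_file(path, file_name))
    # -> ./data/needlebench/names.json (the value that used to be hard-coded)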
@@ -31,7 +31,9 @@ needlebench_eval_cfg = dict(
 needle_num_list = list(range(2, 100, 3))
 document_depth_percent_intervals = 20
 repeats = 30
-names_path = './data/needlebench/names.json'
+path = 'opencompass/needlebench'
+file_name = 'names.json'
+
 
 needlebench_atc_datasets_zh = []
 needlebench_atc_datasets_en = []
@@ -44,7 +46,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_en_ordered',
         'type': NeedleBenchATCOrderedDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'English',
         'repeats': repeats,
@@ -61,7 +64,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_zh_ordered',
         'type': NeedleBenchATCOrderedDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'Chinese',
         'repeats': repeats,
@@ -77,7 +81,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_en',
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'English',
         'repeats': repeats,
@@ -93,7 +98,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_zh',
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'Chinese',
         'repeats': repeats,
@@ -61,7 +61,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 20, 1))
-names_path = './data/needlebench/names.json'
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 
 repeats = 10
 
@@ -122,7 +123,8 @@ for _name in list(single_choice_prompts.keys()):
         dataset_dict = {
             'abbr': abbr,
             'type': NeedleBenchATCDataset,
-            'path': names_path,
+            'path': path,
+            'file_name': file_name,
             'num_needles': num_needles,
             'language': language,
             'repeats': repeats,
@@ -61,8 +61,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 20, 1))
-names_path = './data/needlebench/names.json'
-
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 repeats = 10
 
 # Use Zero-Shot or not
@@ -120,7 +120,8 @@ for _name in list(single_choice_prompts.keys()):
         dataset_dict = {
             'abbr': abbr,
             'type': NeedleBenchATCDataset,
-            'path': names_path,
+            'path': path,
+            'file_name': file_name,
             'num_needles': num_needles,
             'language': language,
             'repeats': repeats,
@@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()):
         dataset_dict = {
             'abbr': abbr,
             'type': NeedleBenchATCDataset,
-            'path': names_path,
+            'path': path,
+            'file_name': file_name,
             'num_needles': num_needles,
             'language': language,
             'repeats': repeats,
@@ -10,14 +10,38 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
 few_shot_prompts = {
     'single_choice_prompts': {
         'single_choice_en_reasoning': [
-            dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
-            dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
-            dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
-            dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
+            dict(
+                role='HUMAN',
+                prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C",
+            ),
+            dict(
+                role='HUMAN',
+                prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D",
+            ),
+            dict(
+                role='HUMAN',
+                prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B",
+            ),
+            dict(
+                role='HUMAN',
+                prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A",
+            ),
             dict(role='HUMAN', prompt='Question: {question}'),
         ],
     },
@@ -25,8 +49,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 50, 1))
-names_path = './data/needlebench/names.json'
-
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 repeats = 10
 
 # Use Zero-Shot or not
@@ -48,49 +72,54 @@ single_choice_prompts = needlebench_prompts['single_choice_prompts']
 for _name in list(single_choice_prompts.keys()):
     if with_few_shot:
         assert few_shot_samples > 0 and few_shot_samples <= 4
-        single_choice_prompts[_name] = \
-            single_choice_prompts[_name][- few_shot_samples * 2 - 1:]
+        single_choice_prompts[_name] = single_choice_prompts[_name][
+            -few_shot_samples * 2 - 1 :
+        ]
 
 # ----------------------- Dataset Settings ----------------------- #
 
 needlebench_datasets = []
 
 
-needlebench_atc_reader_cfg = dict(input_columns=['question'],
-                                  output_column='answer')
+needlebench_atc_reader_cfg = dict(input_columns=['question'], output_column='answer')
 
 for _name in list(single_choice_prompts.keys()):
 
     needlebench_atc_infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,
-            template=dict(
-                round=(single_choice_prompts[_name])),
+            template=dict(round=(single_choice_prompts[_name])),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer,),
+        inferencer=dict(
+            type=GenInferencer,
+        ),
     )
 
     needlebench_atc_eval_cfg = dict(
         evaluator=dict(type=CircularEvaluator),
-        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+    )
 
     for num_needles in needle_num_list:
-        abbr = (f'NeedleBenchATCDataset-'
-                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
+        abbr = (
+            f'NeedleBenchATCDataset-'
+            f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}'
+        )
        language = 'English' if 'en' in _name else 'Chinese'
         if 'reasoning' in _name:
             abbr += '-Reasoning'
         dataset_dict = {
             'abbr': abbr,
             'type': NeedleBenchATCDataset,
-            'path': names_path,
+            'path': path,
+            'file_name':file_name,
             'num_needles': num_needles,
             'language': language,
             'repeats': repeats,
             'with_circular': with_circular_eval,
             'reader_cfg': needlebench_atc_reader_cfg,
             'infer_cfg': needlebench_atc_infer_cfg,
-            'eval_cfg': needlebench_atc_eval_cfg
+            'eval_cfg': needlebench_atc_eval_cfg,
         }
         needlebench_datasets.append(dataset_dict)
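A note on the slice reformatted in the last hunk above: each few-shot example in single_choice_prompts is a HUMAN/BOT pair of turns and the final 'Question: {question}' turn must always be kept, so the slice retains the last few_shot_samples * 2 + 1 entries; the `assert few_shot_samples <= 4` guard matches the four example pairs defined in the prompt hunk. A minimal illustration with placeholder strings (not the real prompts):

    # Placeholders for the four HUMAN/BOT example pairs plus the final
    # 'Question: {question}' turn from the config above.
    prompts = ['H1', 'B1', 'H2', 'B2', 'H3', 'B3', 'H4', 'B4', 'Hq']

    few_shot_samples = 2
    kept = prompts[-few_shot_samples * 2 - 1:]
    print(kept)  # ['H3', 'B3', 'H4', 'B4', 'Hq'] -> two example pairs + question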
@@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()):
         dataset_dict = {
             'abbr': abbr,
             'type': NeedleBenchATCDataset,
-            'path': names_path,
+            'path': path,
+            'file_name': file_name,
             'num_needles': num_needles,
             'language': language,
             'repeats': repeats,
@@ -25,8 +25,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 80, 1))
-names_path = './data/needlebench/names.json'
-
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 repeats = 10
 
 # Use Zero-Shot or not
@@ -84,7 +84,8 @@ for _name in list(single_choice_prompts.keys()):
         dataset_dict = {
             'abbr': abbr,
             'type': NeedleBenchATCDataset,
-            'path': names_path,
+            'path': path,
+            'file_name': file_name,
             'num_needles': num_needles,
             'language': language,
             'repeats': repeats,
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_en.json'
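For orientation, the inference and evaluation configs assembled from the fragments in the hunk above read as follows after the change (indentation reconstructed; PromptTemplate, ZeroRetriever, GenInferencer and the NeedleBench evaluator/postprocessors are imported at the top of the file, outside this hunk):

    needlebench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt='{prompt}'),
                    dict(role='BOT', prompt='{answer}\n'),
                ]
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    needlebench_eval_cfg = dict(
        evaluator=dict(type=NeedleBenchMultiEvaluator),
        pred_postprocessor=dict(type=needlebench_postprocess),
        dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
        pred_role='BOT',
    )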
@@ -69,7 +71,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -85,7 +87,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)
 
@@ -96,7 +98,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -112,7 +114,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)
 
@@ -123,7 +125,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -139,7 +141,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)
 
@@ -150,7 +152,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -166,12 +168,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)
 
 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -184,7 +186,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -200,7 +202,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)
 
@@ -211,7 +213,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -227,7 +229,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)
 
@@ -238,7 +240,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -254,7 +256,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)
 
@@ -265,7 +267,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -281,6 +283,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_en_1000k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_1000k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -79,7 +80,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)
 
@@ -88,8 +89,7 @@ needlebench_zh_datasets = []
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_zh_1000k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_1000k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -103,6 +103,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,21 +41,23 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,7 +66,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_en_1000k',
+            f'Depth{int(depth_percent)}_origin_en_1000k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -78,7 +80,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)
 
@@ -90,7 +92,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_zh_1000k',
+            f'Depth{int(depth_percent)}_origin_zh_1000k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -104,6 +106,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,16 +41,18 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
@@ -58,7 +60,7 @@ document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -71,7 +73,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -87,7 +89,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)
 
@@ -98,7 +100,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -114,7 +116,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)
 
@@ -125,7 +127,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -141,7 +143,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)
 
@@ -152,7 +154,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -168,12 +170,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)
 
 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -186,7 +188,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -202,7 +204,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)
 
@@ -213,7 +215,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -229,7 +231,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)
 
@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -256,7 +258,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)
 
@@ -267,7 +269,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -283,6 +285,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_en_128k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_128k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -79,7 +80,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)
 
@@ -88,8 +89,7 @@ needlebench_zh_datasets = []
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_zh_128k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_128k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -103,6 +103,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -66,7 +68,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_en_128k',
+            f'Depth{int(depth_percent)}_origin_en_128k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -80,7 +82,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -92,7 +94,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_zh_128k',
+            f'Depth{int(depth_percent)}_origin_zh_128k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -106,6 +108,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
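The path change above is the heart of the auto-download update: 'opencompass/needlebench' is a dataset id rather than a directory on disk, and the NeedleBench loaders are expected to resolve it through OpenCompass's data-path utility before reading files. A minimal sketch of that resolution, assuming the get_data_path helper from opencompass.utils (the exact mapping and cache location are an assumption here, not part of this diff):

# Hypothetical illustration: resolve the dataset id to a usable local path.
# With no DATASET_SOURCE set, the id is assumed to map to the local data
# cache and be fetched on first use; with DATASET_SOURCE=ModelScope it is
# resolved against the hub instead. (Assumed behaviour, illustration only.)
from opencompass.utils import get_data_path

base_path = 'opencompass/needlebench'
local_path = get_data_path(base_path)
print(local_path)  # e.g. './data/needlebench' on a default local setup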
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -70,7 +72,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -86,7 +88,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -97,7 +99,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -113,7 +115,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -124,7 +126,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -140,7 +142,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -151,7 +153,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -167,12 +169,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -185,7 +187,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -201,7 +203,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -212,7 +214,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -228,7 +230,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -239,7 +241,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -255,7 +257,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -266,7 +268,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -282,6 +284,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
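Taken together, the multi-needle 200k hunks above sweep the same length-by-depth grid once per needle count and language, so this one file defines a large battery of dataset configs. A quick tally (illustrative only, mirroring the lists in this diff):

# Grid swept by the multi-needle 200k config above.
context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
needle_counts = [2, 3, 4, 5]
languages = ['en', 'zh']

per_list = len(context_lengths) * len(depths_list)      # 88 entries per dataset list
total = per_list * len(needle_counts) * len(languages)  # 704 entries in this file
print(per_list, total)                                  # 88 704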
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_en_200k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_200k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -80,7 +81,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)

@@ -89,8 +90,7 @@ needlebench_zh_datasets = []

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_zh_200k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_200k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -104,6 +104,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -65,7 +67,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_en_200k',
+            f'Depth{int(depth_percent)}_origin_en_200k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -79,7 +81,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -91,7 +93,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_zh_200k',
+            f'Depth{int(depth_percent)}_origin_zh_200k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -105,6 +107,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [32000, 128000, 256000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -70,7 +72,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -86,7 +88,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -97,7 +99,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -113,7 +115,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -124,7 +126,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -140,7 +142,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -151,7 +153,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -167,12 +169,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -185,7 +187,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -201,7 +203,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -212,7 +214,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -228,7 +230,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -239,7 +241,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -255,7 +257,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -266,7 +268,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -282,6 +284,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [32000, 128000, 256000]
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_en_256k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_256k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -80,7 +81,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)

@@ -89,8 +90,7 @@ needlebench_zh_datasets = []

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_zh_256k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_256k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -104,6 +104,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [32000, 128000, 256000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -65,7 +67,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_en_256k',
+            f'Depth{int(depth_percent)}_origin_en_256k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -79,7 +81,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -91,7 +93,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_zh_256k',
+            f'Depth{int(depth_percent)}_origin_zh_256k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -105,6 +107,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,16 +41,18 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
@@ -58,7 +60,7 @@ document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -71,7 +73,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -87,7 +89,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -98,7 +100,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -114,7 +116,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -125,7 +127,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -141,7 +143,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -152,7 +154,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -168,12 +170,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -186,7 +188,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -202,7 +204,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -213,7 +215,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -229,7 +231,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -256,7 +258,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -267,7 +269,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
+            f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -283,6 +285,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_en_32k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_32k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -79,7 +80,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)

@@ -88,8 +89,7 @@ needlebench_zh_datasets = []

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-                f'_parallel_zh_32k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_32k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -103,6 +103,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -66,7 +68,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_en_32k',
+            f'Depth{int(depth_percent)}_origin_en_32k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -80,7 +82,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -92,7 +94,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
-                    f'Depth{int(depth_percent)}_origin_zh_32k',
+            f'Depth{int(depth_percent)}_origin_zh_32k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -106,6 +108,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-    )
     ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list(range(1000, 5000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -68,11 +70,11 @@ language = 'English'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -88,7 +90,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -97,11 +99,11 @@ needlebench_3needle_en_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -117,7 +119,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -146,7 +148,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -175,12 +177,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -191,11 +193,11 @@ language = 'Chinese'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -211,7 +213,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -269,7 +271,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -298,6 +300,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
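Each needle count above fills its own list (needlebench_2needle_en_datasets through needlebench_5needle_zh_datasets). Downstream these are typically concatenated into one collection; a sketch of that merge, assuming the merge site and target name (neither is shown in this diff):

# Hypothetical merge of the per-needle-count lists defined above.
needlebench_datasets = (
    needlebench_2needle_en_datasets + needlebench_3needle_en_datasets +
    needlebench_4needle_en_datasets + needlebench_5needle_en_datasets +
    needlebench_2needle_zh_datasets + needlebench_3needle_zh_datasets +
    needlebench_4needle_zh_datasets + needlebench_5needle_zh_datasets)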
@@ -41,34 +41,35 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-    )
     ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list(range(1000, 5000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
 depths_float = generate_depth_percents(
-    document_depth_percent_intervals,
-    document_depth_percent_interval_type)
+    document_depth_percent_intervals, document_depth_percent_interval_type
+)
 depths = [int(depth) for depth in depths_float]

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_4k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_4k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -82,7 +83,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)

@@ -91,8 +92,7 @@ needlebench_zh_datasets = []

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_4k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_4k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -106,6 +106,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
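The reflowed calls to generate_depth_percents take an interval count and an interval type and yield the needle-insertion depths. A generator consistent with these call sites might look like the sketch below, under the assumption that 'linear' means evenly spaced depths from 0 to 100 inclusive; the real implementation lives elsewhere in the repository.

def generate_depth_percents(intervals: int, interval_type: str):
    # Sketch: 'linear' yields `intervals` evenly spaced depths in [0, 100].
    if interval_type == 'linear':
        step = 100 / max(intervals - 1, 1)
        return [i * step for i in range(intervals)]
    raise ValueError(f'Unsupported interval type: {interval_type}')


# As in the configs: floats are truncated to ints for the dataset abbrs.
depths_float = generate_depth_percents(20, 'linear')
depths = [int(depth) for depth in depths_float]  # [0, 5, 10, 15, 21, ..., 94, 100]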
@@ -41,33 +41,35 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-    )
     ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list(range(1000, 5000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_4k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -81,7 +83,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_4k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -109,6 +111,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-    )
     ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -68,11 +70,11 @@ language = 'English'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -88,7 +90,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -97,11 +99,11 @@ needlebench_3needle_en_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -117,7 +119,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -146,7 +148,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -175,12 +177,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -191,11 +193,11 @@ language = 'Chinese'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -211,7 +213,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -269,7 +271,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = []

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -298,6 +300,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,34 +41,35 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-    )
     ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
 depths_float = generate_depth_percents(
-    document_depth_percent_intervals,
-    document_depth_percent_interval_type)
+    document_depth_percent_intervals, document_depth_percent_interval_type
+)
 depths = [int(depth) for depth in depths_float]

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_8k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_8k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -82,7 +83,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)

@@ -91,8 +92,7 @@ needlebench_zh_datasets = []

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_8k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_8k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -106,6 +106,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,36 +41,38 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-    )
     ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals_list = [1, 5, 10, 15, 20]
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'

 for document_depth_percent_intervals in document_depth_percent_intervals_list:
     depths_float = generate_depth_percents(
-        document_depth_percent_intervals,
-        document_depth_percent_interval_type)
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    )
     depths = [int(depth) for depth in depths_float]

     for original_context_length in context_lengths:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'_parallel_en_8k_batch{document_depth_percent_intervals}',
             'type': NeedleBenchParallelDataset,
             'path': base_path,
             'needle_file_name': needle_file_name,
@@ -84,7 +86,7 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list:
             'language': 'English',
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -94,14 +96,14 @@ needle_file_name = 'needles.jsonl'

 for document_depth_percent_intervals in document_depth_percent_intervals_list:
     depths_float = generate_depth_percents(
-        document_depth_percent_intervals,
-        document_depth_percent_interval_type)
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    )
     depths = [int(depth) for depth in depths_float]

     for original_context_length in context_lengths:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'_parallel_zh_8k_batch{document_depth_percent_intervals}',
             'type': NeedleBenchParallelDataset,
             'path': base_path,
             'needle_file_name': needle_file_name,
@@ -115,6 +117,6 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list:
             'language': 'Chinese',
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,33 +41,35 @@ needlebench_infer_cfg = dict(
             dict(role='HUMAN', prompt='{prompt}'),
             dict(role='BOT', prompt='{answer}\n'),
         ]
-    )
     ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_8k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -81,7 +83,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl'

 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_8k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -109,6 +111,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -23,9 +23,9 @@ def create_m_rs_names_list(context_lengths, depths, needle_counts,
             multi_needle_en_list.extend(names_list)
         elif language == 'zh':
             multi_needle_zh_list.extend(names_list)
-    names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list
-    names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list
-    names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list
+    names_dict[f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}'] = multi_needle_list
+    names_dict[f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}'] = multi_needle_en_list
+    names_dict[f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}'] = multi_needle_zh_list

     return names_dict

@@ -56,9 +56,9 @@ def create_summarizer(context_lengths, depths, dataset_size,
             single_needle_en_list.extend(names_list)
         elif language == 'zh':
             single_needle_zh_list.extend(names_list)
-    names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list
-    names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list
-    names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list
+    names_dict[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}'] = single_needle_list
+    names_dict[f'Single-Needle-Retrieval-EN-{dataset_size.upper()}'] = single_needle_en_list
+    names_dict[f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}'] = single_needle_zh_list

     parallel_list = []
     parallel_en_list = []
@@ -74,39 +74,39 @@ def create_summarizer(context_lengths, depths, dataset_size,
             parallel_en_list.extend(names_list)
         elif language == 'zh':
             parallel_zh_list.extend(names_list)
-    names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list
-    names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list
-    names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list
+    names_dict[f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}'] = parallel_list
+    names_dict[f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}'] = parallel_en_list
+    names_dict[f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}'] = parallel_zh_list

     summary_groups = [
         {'name': key, 'subsets': value} for key, value in names_dict.items()
     ]

     summary_groups.append({
-        'name': 'NeedleBench-Overall-Score',
-        'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'],
-                    ['Multi-Needle-Reasoning(M-RS)', 'naive_average'],
-                    ['Multi-Needle-Retrieval(M-RT)', 'average_score']],
-        'weights': {'Single-Needle-Retrieval(S-RT)': 0.4,
-                    'Multi-Needle-Reasoning(M-RS)': 0.3,
-                    'Multi-Needle-Retrieval(M-RT)': 0.3}})
+        'name': f'NeedleBench-Overall-Score-{dataset_size.upper()}',
+        'subsets': [[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}', 'naive_average'],
+                    [f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}', 'naive_average'],
+                    [f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}', 'average_score']],
+        'weights': {f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}': 0.4,
+                    f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}': 0.3,
+                    f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}': 0.3}})
     summarizer_config = {
         'type': NeedleBenchSummarizer,
         'summary_groups': summary_groups,
         'dataset_abbrs': [
-            'NeedleBench-Overall-Score',
+            f'NeedleBench-Overall-Score-{dataset_size.upper()}',
             f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
-            'Single-Needle-Retrieval(S-RT)',
-            'Single-Needle-Retrieval-EN',
-            'Single-Needle-Retrieval-ZH',
+            f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}',
+            f'Single-Needle-Retrieval-EN-{dataset_size.upper()}',
+            f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}',
             f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
-            'Multi-Needle-Retrieval(M-RT)',
-            'Multi-Needle-Retrieval-EN',
-            'Multi-Needle-Retrieval-ZH',
+            f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}',
+            f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}',
+            f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}',
             f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
-            'Multi-Needle-Reasoning(M-RS)',
-            'Multi-Needle-Reasoning-EN',
-            'Multi-Needle-Reasoning-ZH',
+            f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}',
+            f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}',
+            f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}',
             f'2-Needle-EN-{dataset_size.upper()}',
             f'2-Needle-ZH-{dataset_size.upper()}',
             f'3-Needle-EN-{dataset_size.upper()}',
@@ -31,7 +31,9 @@ needlebench_eval_cfg = dict(
 needle_num_list = list(range(2, 100, 3))
 document_depth_percent_intervals = 20
 repeats = 30
-names_path = './data/needlebench/names.json'
+
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 
 needlebench_atc_datasets_zh = []
 needlebench_atc_datasets_en = []
@@ -44,7 +46,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_en_ordered',
         'type': NeedleBenchATCOrderedDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'English',
         'repeats': repeats,
@@ -61,7 +64,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_zh_ordered',
         'type': NeedleBenchATCOrderedDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'Chinese',
         'repeats': repeats,
@@ -77,7 +81,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_en',
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'English',
         'repeats': repeats,
@@ -93,7 +98,8 @@ for num_needles in needle_num_list:
         'abbr': f'needlebench_atc_challenge'
                 f'needle_{num_needles}_zh',
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': 'Chinese',
         'repeats': repeats,
@@ -61,7 +61,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 20, 1))
-names_path = './data/needlebench/names.json'
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 
 repeats = 10
 
@@ -122,7 +123,8 @@ for _name in list(single_choice_prompts.keys()):
     dataset_dict = {
         'abbr': abbr,
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': language,
         'repeats': repeats,
@@ -61,8 +61,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 20, 1))
-names_path = './data/needlebench/names.json'
-
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 repeats = 10
 
 # Use Zero-Shot or not
@@ -120,7 +120,8 @@ for _name in list(single_choice_prompts.keys()):
     dataset_dict = {
         'abbr': abbr,
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': language,
         'repeats': repeats,
@@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()):
     dataset_dict = {
         'abbr': abbr,
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': language,
         'repeats': repeats,
@@ -10,14 +10,38 @@ from opencompass.utils.text_postprocessors import first_option_postprocess
 few_shot_prompts = {
     'single_choice_prompts': {
         'single_choice_en_reasoning': [
-            dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
-            dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
-            dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
-            dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
-            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
+            dict(
+                role='HUMAN',
+                prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C",
+            ),
+            dict(
+                role='HUMAN',
+                prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D",
+            ),
+            dict(
+                role='HUMAN',
+                prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B",
+            ),
+            dict(
+                role='HUMAN',
+                prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n",
+            ),
+            dict(
+                role='BOT',
+                prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A",
+            ),
             dict(role='HUMAN', prompt='Question: {question}'),
         ],
     },
@@ -25,8 +49,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 50, 1))
-names_path = './data/needlebench/names.json'
-
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 repeats = 10
 
 # Use Zero-Shot or not
@@ -48,49 +72,54 @@ single_choice_prompts = needlebench_prompts['single_choice_prompts']
 for _name in list(single_choice_prompts.keys()):
     if with_few_shot:
         assert few_shot_samples > 0 and few_shot_samples <= 4
-        single_choice_prompts[_name] = \
-            single_choice_prompts[_name][- few_shot_samples * 2 - 1:]
+        single_choice_prompts[_name] = single_choice_prompts[_name][
+            -few_shot_samples * 2 - 1 :
+        ]
 
 # ----------------------- Dataset Settings ----------------------- #
 
 needlebench_datasets = []
 
 
-needlebench_atc_reader_cfg = dict(input_columns=['question'],
-                                  output_column='answer')
+needlebench_atc_reader_cfg = dict(input_columns=['question'], output_column='answer')
 
 for _name in list(single_choice_prompts.keys()):
 
     needlebench_atc_infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,
-            template=dict(
-                round=(single_choice_prompts[_name])),
+            template=dict(round=(single_choice_prompts[_name])),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer,),
+        inferencer=dict(
+            type=GenInferencer,
+        ),
     )
 
     needlebench_atc_eval_cfg = dict(
         evaluator=dict(type=CircularEvaluator),
-        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+    )
 
     for num_needles in needle_num_list:
-        abbr = (f'NeedleBenchATCDataset-'
-                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
+        abbr = (
+            f'NeedleBenchATCDataset-'
+            f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}'
+        )
         language = 'English' if 'en' in _name else 'Chinese'
         if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
-            'path': names_path,
+            'path': path,
+            'file_name':file_name,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
-            'eval_cfg': needlebench_atc_eval_cfg
+            'eval_cfg': needlebench_atc_eval_cfg,
        }
        needlebench_datasets.append(dataset_dict)
@@ -30,7 +30,8 @@ for _name in list(single_choice_prompts.keys()):
     dataset_dict = {
         'abbr': abbr,
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': language,
         'repeats': repeats,
@@ -25,8 +25,8 @@ few_shot_prompts = {
 
 # ----------------------- Prompt Settings ----------------------- #
 needle_num_list = list(range(2, 80, 1))
-names_path = './data/needlebench/names.json'
-
+path = 'opencompass/needlebench'
+file_name = 'names.json'
 repeats = 10
 
 # Use Zero-Shot or not
@@ -84,7 +84,8 @@ for _name in list(single_choice_prompts.keys()):
     dataset_dict = {
         'abbr': abbr,
         'type': NeedleBenchATCDataset,
-        'path': names_path,
+        'path': path,
+        'file_name': file_name,
         'num_needles': num_needles,
         'language': language,
         'repeats': repeats,
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
         ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -69,7 +71,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -85,7 +87,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)
 
@@ -96,7 +98,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -112,7 +114,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)
 
@@ -123,7 +125,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -139,7 +141,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)
 
@@ -150,7 +152,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -166,12 +168,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)
 
 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -184,7 +186,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -200,7 +202,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)
 
@@ -211,7 +213,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -227,7 +229,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)
 
@@ -238,7 +240,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -254,7 +256,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)
 
@@ -265,7 +267,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -281,6 +283,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
         ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_1000k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_1000k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -79,7 +80,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)
 
@@ -88,8 +89,7 @@ needlebench_zh_datasets = []
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_1000k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_1000k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -103,6 +103,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,21 +41,23 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
         ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,7 +66,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_origin_en_1000k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -78,7 +80,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)
 
@@ -90,7 +92,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_origin_zh_1000k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -104,6 +106,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,16 +41,18 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
         ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
@@ -58,7 +60,7 @@ document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -71,7 +73,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -87,7 +89,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)
 
@@ -98,7 +100,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -114,7 +116,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)
 
@@ -125,7 +127,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -141,7 +143,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)
 
@@ -152,7 +154,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -168,12 +170,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)
 
 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -186,7 +188,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -202,7 +204,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)
 
@@ -213,7 +215,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -229,7 +231,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)
 
@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -256,7 +258,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)
 
@@ -267,7 +269,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -283,6 +285,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
         ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_128k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_128k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -79,7 +80,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)
 
@@ -88,8 +89,7 @@ needlebench_zh_datasets = []
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_128k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_128k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -103,6 +103,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -66,7 +68,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_128k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -80,7 +82,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -92,7 +94,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_128k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -106,6 +108,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
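The recurring base_path change above is the heart of the auto-download update: './data/needlebench' was a fixed local folder, while 'opencompass/needlebench' is a dataset id that the NeedleBench loaders resolve at runtime, fetching the files when they are missing locally. A minimal sketch of that resolution, assuming OpenCompass's get_data_path helper (the exact call is an assumption of this note, not something shown in this diff):

# Sketch only: resolve a dataset id to a local directory, downloading on a cache miss.
from opencompass.utils import get_data_path  # assumed helper

local_dir = get_data_path('opencompass/needlebench')
print(local_dir)  # local path expected to hold needles.jsonl, names.json, ...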
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -70,7 +72,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -86,7 +88,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -97,7 +99,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -113,7 +115,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -124,7 +126,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -140,7 +142,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -151,7 +153,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -167,12 +169,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -185,7 +187,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -201,7 +203,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -212,7 +214,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -228,7 +230,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -239,7 +241,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -255,7 +257,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -266,7 +268,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -282,6 +284,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
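Each multi-needle list above (2 to 5 needles, en and zh) gets one entry per (context length, depth) pair, so the sizes are easy to check against the hunks:

context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
print(len(context_lengths) * len(depths_list))  # 88 datasets per list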
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_200k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_200k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -80,7 +81,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)

@@ -89,8 +90,7 @@ needlebench_zh_datasets = []

 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_200k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_200k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -104,6 +104,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -65,7 +67,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_200k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -79,7 +81,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -91,7 +93,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_200k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -105,6 +107,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
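Aside from the path switch, every file in this commit receives the same mechanical cleanup: trailing commas before closing parentheses, with the final ')' moved onto its own line. A reconstructed before/after of the eval config tail (here the single-needle variant; indentation is assumed, since the diff view drops it):

# before
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchOriginEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')

# after
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchOriginEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT',
)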
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [32000, 128000, 256000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -70,7 +72,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -86,7 +88,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -97,7 +99,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -113,7 +115,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -124,7 +126,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -140,7 +142,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -151,7 +153,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -167,12 +169,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -185,7 +187,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -201,7 +203,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -212,7 +214,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -228,7 +230,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -239,7 +241,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -255,7 +257,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -266,7 +268,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -282,6 +284,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
|
|||||||
dict(role='HUMAN', prompt='{prompt}'),
|
dict(role='HUMAN', prompt='{prompt}'),
|
||||||
dict(role='BOT', prompt='{answer}\n'),
|
dict(role='BOT', prompt='{answer}\n'),
|
||||||
]
|
]
|
||||||
)
|
|
||||||
),
|
),
|
||||||
|
),
|
||||||
retriever=dict(type=ZeroRetriever),
|
retriever=dict(type=ZeroRetriever),
|
||||||
inferencer=dict(type=GenInferencer))
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
needlebench_eval_cfg = dict(
|
needlebench_eval_cfg = dict(
|
||||||
evaluator=dict(type=NeedleBenchParallelEvaluator),
|
evaluator=dict(type=NeedleBenchParallelEvaluator),
|
||||||
pred_postprocessor=dict(type=needlebench_postprocess),
|
pred_postprocessor=dict(type=needlebench_postprocess),
|
||||||
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
|
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
|
||||||
pred_role='BOT')
|
pred_role='BOT',
|
||||||
|
)
|
||||||
|
|
||||||
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
|
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
|
||||||
context_lengths = [32000, 128000, 256000]
|
context_lengths = [32000, 128000, 256000]
|
||||||
document_depth_percent_intervals = 20
|
document_depth_percent_intervals = 20
|
||||||
document_depth_percent_interval_type = 'linear'
|
document_depth_percent_interval_type = 'linear'
|
||||||
|
|
||||||
base_path = './data/needlebench'
|
base_path = 'opencompass/needlebench'
|
||||||
file_list = ['PaulGrahamEssays.jsonl']
|
file_list = ['PaulGrahamEssays.jsonl']
|
||||||
needlebench_en_datasets = []
|
needlebench_en_datasets = []
|
||||||
needle_file_name = 'needles.jsonl'
|
needle_file_name = 'needles.jsonl'
|
||||||
@ -65,8 +67,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
|
|||||||
|
|
||||||
for original_context_length in context_lengths:
|
for original_context_length in context_lengths:
|
||||||
dataset_dict = {
|
dataset_dict = {
|
||||||
'abbr': f'Length{original_context_length}'
|
'abbr': f'Length{original_context_length}' f'_parallel_en_256k',
|
||||||
f'_parallel_en_256k',
|
|
||||||
'type': NeedleBenchParallelDataset,
|
'type': NeedleBenchParallelDataset,
|
||||||
'path': base_path,
|
'path': base_path,
|
||||||
'needle_file_name': needle_file_name,
|
'needle_file_name': needle_file_name,
|
||||||
@ -80,7 +81,7 @@ for original_context_length in context_lengths:
|
|||||||
'language': 'English',
|
'language': 'English',
|
||||||
'reader_cfg': needlebench_reader_cfg,
|
'reader_cfg': needlebench_reader_cfg,
|
||||||
'infer_cfg': needlebench_infer_cfg,
|
'infer_cfg': needlebench_infer_cfg,
|
||||||
'eval_cfg': needlebench_eval_cfg
|
'eval_cfg': needlebench_eval_cfg,
|
||||||
}
|
}
|
||||||
needlebench_en_datasets.append(dataset_dict)
|
needlebench_en_datasets.append(dataset_dict)
|
||||||
|
|
||||||
@ -89,8 +90,7 @@ needlebench_zh_datasets = []
|
|||||||
|
|
||||||
for original_context_length in context_lengths:
|
for original_context_length in context_lengths:
|
||||||
dataset_dict = {
|
dataset_dict = {
|
||||||
'abbr': f'Length{original_context_length}'
|
'abbr': f'Length{original_context_length}' f'_parallel_zh_256k',
|
||||||
f'_parallel_zh_256k',
|
|
||||||
'type': NeedleBenchParallelDataset,
|
'type': NeedleBenchParallelDataset,
|
||||||
'path': base_path,
|
'path': base_path,
|
||||||
'needle_file_name': needle_file_name,
|
'needle_file_name': needle_file_name,
|
||||||
@ -104,6 +104,6 @@ for original_context_length in context_lengths:
|
|||||||
'language': 'Chinese',
|
'language': 'Chinese',
|
||||||
'reader_cfg': needlebench_reader_cfg,
|
'reader_cfg': needlebench_reader_cfg,
|
||||||
'infer_cfg': needlebench_infer_cfg,
|
'infer_cfg': needlebench_infer_cfg,
|
||||||
'eval_cfg': needlebench_eval_cfg
|
'eval_cfg': needlebench_eval_cfg,
|
||||||
}
|
}
|
||||||
needlebench_zh_datasets.append(dataset_dict)
|
needlebench_zh_datasets.append(dataset_dict)
|
||||||
|
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 # context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
 context_lengths = [32000, 128000, 256000]
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -65,7 +67,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_256k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -79,7 +81,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)

@@ -91,7 +93,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_256k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -105,6 +107,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
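The 256k suites trim the grid to three lengths, so each origin list above ends up with 33 entries:

context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
print(len(context_lengths) * len(depths_list))  # 33 datasets per language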
@@ -41,16 +41,18 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)

 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)

 context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
@@ -58,7 +60,7 @@ document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'

 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']

 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -71,7 +73,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -87,7 +89,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)

@@ -98,7 +100,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -114,7 +116,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)

@@ -125,7 +127,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -141,7 +143,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)

@@ -152,7 +154,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -168,12 +170,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)

 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']

 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -186,7 +188,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -202,7 +204,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)

@@ -213,7 +215,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -229,7 +231,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)

@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -256,7 +258,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)

@@ -267,7 +269,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -283,6 +285,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,22 +41,24 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -64,8 +66,7 @@ depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_32k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_32k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -79,7 +80,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)
 
@@ -88,8 +89,7 @@ needlebench_zh_datasets = []
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_32k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_32k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -103,6 +103,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
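The recurring change from `base_path = './data/needlebench'` to `base_path = 'opencompass/needlebench'` in these configs swaps a literal filesystem path for a dataset identifier that the loaders can resolve at run time, downloading the files on first use. A minimal sketch of that resolution idea, assuming a hypothetical `DATASET_URLS` mapping and cache layout rather than the actual OpenCompass internals:

    import os
    import shutil
    import urllib.request

    DATASET_URLS = {  # hypothetical id -> mirror mapping
        'opencompass/needlebench': 'https://example.com/needlebench.zip',
    }
    CACHE_ROOT = os.path.expanduser('~/.cache/opencompass')

    def resolve_data_path(path: str) -> str:
        """Return a local directory for ``path``, fetching it on first use."""
        if os.path.exists(path):  # a plain local path (the old style) wins
            return path
        local_dir = os.path.join(CACHE_ROOT, path)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir, exist_ok=True)
            archive = os.path.join(local_dir, 'data.zip')
            urllib.request.urlretrieve(DATASET_URLS[path], archive)  # one-time fetch
            shutil.unpack_archive(archive, local_dir)
            os.remove(archive)
        return local_dir

Existing local checkouts keep working under such a scheme, since a path that already exists on disk is returned unchanged.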
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
 depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
@@ -66,7 +68,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_32k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -80,7 +82,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)
 
@@ -92,7 +94,7 @@ for original_context_length in context_lengths:
     for depth_percent in depths_list:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_32k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -106,6 +108,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
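Each `dataset_dict` in these files is a plain config dict: `'type'` holds the dataset class and the remaining keys parameterize loading and evaluation. A rough sketch of how such a dict can be consumed — the real dispatch goes through OpenCompass's registry and config machinery, so `build_dataset` here is purely illustrative:

    def build_dataset(cfg: dict):
        cfg = dict(cfg)                  # avoid mutating the shared config
        dataset_cls = cfg.pop('type')    # e.g. NeedleBenchOriginDataset
        cfg.pop('abbr', None)            # bookkeeping, not a load() argument
        for key in ('reader_cfg', 'infer_cfg', 'eval_cfg'):
            cfg.pop(key, None)           # consumed by other pipeline stages
        # Whatever remains ('path', 'length', 'depth', ...) drives loading.
        return dataset_cls.load(**cfg)

This is also why the recurring trailing-comma changes are purely cosmetic: the dict contents are identical either way.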
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list(range(1000, 5000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -68,11 +70,11 @@ language = 'English'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -88,7 +90,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)
 
@@ -97,11 +99,11 @@ needlebench_3needle_en_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -117,7 +119,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)
 
@@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -146,7 +148,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)
 
@@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -175,12 +177,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)
 
 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -191,11 +193,11 @@ language = 'Chinese'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -211,7 +213,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)
 
@@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)
 
@@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -269,7 +271,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)
 
@@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -298,6 +300,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
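`generate_depth_percents(intervals, 'linear')`, used in every loop above, turns an interval count into evenly spaced needle-insertion depths over 0-100%. A minimal sketch of the linear case, written to that apparent contract (the actual helper ships with OpenCompass and may handle edge cases differently):

    def generate_depth_percents(intervals: int, interval_type: str):
        if interval_type != 'linear':
            raise ValueError(f'unsupported interval type: {interval_type}')
        if intervals == 1:
            return [0.0]
        step = 100 / (intervals - 1)       # span 0..100 inclusive
        return [i * step for i in range(intervals)]

With `document_depth_percent_intervals = 20` this yields 20 depths from 0 to 100 in steps of roughly 5.26, which the configs then truncate via `int(depth_percent)` when building the `abbr` strings.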
@@ -41,34 +41,35 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list(range(1000, 5000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
 depths_float = generate_depth_percents(
-    document_depth_percent_intervals,
-    document_depth_percent_interval_type)
+    document_depth_percent_intervals, document_depth_percent_interval_type
+)
 depths = [int(depth) for depth in depths_float]
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_4k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_4k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -82,7 +83,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)
 
@@ -91,8 +92,7 @@ needlebench_zh_datasets = []
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_4k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_4k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -106,6 +106,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
        'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,33 +41,35 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list(range(1000, 5000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_4k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -81,7 +83,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)
 
@@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_4k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -109,6 +111,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@@ -41,23 +41,25 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchMultiEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
 # ----------English Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_en.json'
@@ -68,11 +70,11 @@ language = 'English'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -88,7 +90,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_en_datasets.append(dataset_dict)
 
@@ -97,11 +99,11 @@ needlebench_3needle_en_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -117,7 +119,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_en_datasets.append(dataset_dict)
 
@@ -126,11 +128,11 @@ needlebench_4needle_en_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -146,7 +148,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_en_datasets.append(dataset_dict)
 
@@ -155,11 +157,11 @@ needlebench_5needle_en_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -175,12 +177,12 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_en_datasets.append(dataset_dict)
 
 # ----------Chinese Version----------
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['zh_finance.jsonl']
 
 needle_file_name = 'multi_needle_reasoning_zh.json'
@@ -191,11 +193,11 @@ language = 'Chinese'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -211,7 +213,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_2needle_zh_datasets.append(dataset_dict)
 
@@ -220,11 +222,11 @@ needlebench_3needle_zh_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -240,7 +242,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_3needle_zh_datasets.append(dataset_dict)
 
@@ -249,11 +251,11 @@ needlebench_4needle_zh_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -269,7 +271,7 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_4needle_zh_datasets.append(dataset_dict)
 
@@ -278,11 +280,11 @@ needlebench_5needle_zh_datasets = []
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
             'type': NeedleBenchMultiDataset,
             'path': base_path,
             'length': original_context_length,
@@ -298,6 +300,6 @@ for original_context_length in context_lengths:
             'diff': diff,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_5needle_zh_datasets.append(dataset_dict)
@@ -41,34 +41,35 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
 depths_float = generate_depth_percents(
-    document_depth_percent_intervals,
-    document_depth_percent_interval_type)
+    document_depth_percent_intervals, document_depth_percent_interval_type
+)
 depths = [int(depth) for depth in depths_float]
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_en_8k',
+        'abbr': f'Length{original_context_length}' f'_parallel_en_8k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -82,7 +83,7 @@ for original_context_length in context_lengths:
         'language': 'English',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_en_datasets.append(dataset_dict)
 
@@ -91,8 +92,7 @@ needlebench_zh_datasets = []
 
 for original_context_length in context_lengths:
     dataset_dict = {
-        'abbr': f'Length{original_context_length}'
-        f'_parallel_zh_8k',
+        'abbr': f'Length{original_context_length}' f'_parallel_zh_8k',
         'type': NeedleBenchParallelDataset,
         'path': base_path,
         'needle_file_name': needle_file_name,
@@ -106,6 +106,6 @@ for original_context_length in context_lengths:
         'language': 'Chinese',
         'reader_cfg': needlebench_reader_cfg,
         'infer_cfg': needlebench_infer_cfg,
-        'eval_cfg': needlebench_eval_cfg
+        'eval_cfg': needlebench_eval_cfg,
     }
     needlebench_zh_datasets.append(dataset_dict)
@@ -41,36 +41,38 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchParallelEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals_list = [1, 5, 10, 15, 20]
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
 
 for document_depth_percent_intervals in document_depth_percent_intervals_list:
     depths_float = generate_depth_percents(
-        document_depth_percent_intervals,
-        document_depth_percent_interval_type)
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    )
     depths = [int(depth) for depth in depths_float]
 
     for original_context_length in context_lengths:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'_parallel_en_8k_batch{document_depth_percent_intervals}',
             'type': NeedleBenchParallelDataset,
             'path': base_path,
             'needle_file_name': needle_file_name,
@@ -84,7 +86,7 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list:
             'language': 'English',
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)
 
@@ -94,14 +96,14 @@ needle_file_name = 'needles.jsonl'
 
 for document_depth_percent_intervals in document_depth_percent_intervals_list:
     depths_float = generate_depth_percents(
-        document_depth_percent_intervals,
-        document_depth_percent_interval_type)
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    )
     depths = [int(depth) for depth in depths_float]
 
     for original_context_length in context_lengths:
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'_parallel_zh_8k_batch{document_depth_percent_intervals}',
             'type': NeedleBenchParallelDataset,
             'path': base_path,
             'needle_file_name': needle_file_name,
@@ -115,6 +117,6 @@ for document_depth_percent_intervals in document_depth_percent_intervals_list:
             'language': 'Chinese',
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
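Under the sketch of `generate_depth_percents` above, the `document_depth_percent_intervals_list = [1, 5, 10, 15, 20]` batch sweep would test each context length at progressively denser depth grids, for example:

    for n in [1, 5, 10, 15, 20]:
        print(n, [int(d) for d in generate_depth_percents(n, 'linear')])
    # 1  [0]
    # 5  [0, 25, 50, 75, 100]
    # 10 [0, 11, 22, 33, 44, 55, 66, 77, 88, 100]
    # ...

The exact values depend on the real helper's edge-case handling, so treat these as indicative only.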
@@ -41,33 +41,35 @@ needlebench_infer_cfg = dict(
                 dict(role='HUMAN', prompt='{prompt}'),
                 dict(role='BOT', prompt='{answer}\n'),
             ]
-        )
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer))
+    inferencer=dict(type=GenInferencer),
+)
 
 needlebench_eval_cfg = dict(
     evaluator=dict(type=NeedleBenchOriginEvaluator),
     pred_postprocessor=dict(type=needlebench_postprocess),
     dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
-    pred_role='BOT')
+    pred_role='BOT',
+)
 
 context_lengths = list(range(5000, 9000, 1000))
 document_depth_percent_intervals = 20
 document_depth_percent_interval_type = 'linear'
 
-base_path = './data/needlebench'
+base_path = 'opencompass/needlebench'
 file_list = ['PaulGrahamEssays.jsonl']
 needlebench_en_datasets = []
 needle_file_name = 'needles.jsonl'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_en_8k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -81,7 +83,7 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_en_datasets.append(dataset_dict)
 
@@ -91,11 +93,11 @@ needle_file_name = 'needles.jsonl'
 
 for original_context_length in context_lengths:
     for depth_percent in generate_depth_percents(
-            document_depth_percent_intervals,
-            document_depth_percent_interval_type):
+        document_depth_percent_intervals, document_depth_percent_interval_type
+    ):
         dataset_dict = {
             'abbr': f'Length{original_context_length}'
             f'Depth{int(depth_percent)}_origin_zh_8k',
             'type': NeedleBenchOriginDataset,
             'path': base_path,
             'length': original_context_length,
@@ -109,6 +111,6 @@ for original_context_length in context_lengths:
             'needle_file_name': needle_file_name,
             'reader_cfg': needlebench_reader_cfg,
             'infer_cfg': needlebench_infer_cfg,
-            'eval_cfg': needlebench_eval_cfg
+            'eval_cfg': needlebench_eval_cfg,
         }
         needlebench_zh_datasets.append(dataset_dict)
@ -23,9 +23,9 @@ def create_m_rs_names_list(context_lengths, depths, needle_counts,
|
|||||||
multi_needle_en_list.extend(names_list)
|
multi_needle_en_list.extend(names_list)
|
||||||
elif language == 'zh':
|
elif language == 'zh':
|
||||||
multi_needle_zh_list.extend(names_list)
|
multi_needle_zh_list.extend(names_list)
|
||||||
names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list
|
names_dict[f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}'] = multi_needle_list
|
||||||
names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list
|
names_dict[f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}'] = multi_needle_en_list
|
||||||
names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list
|
names_dict[f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}'] = multi_needle_zh_list
|
||||||
|
|
||||||
return names_dict
|
return names_dict
|
||||||
|
|
||||||
@ -56,9 +56,9 @@ def create_summarizer(context_lengths, depths, dataset_size,
|
|||||||
single_needle_en_list.extend(names_list)
|
single_needle_en_list.extend(names_list)
|
||||||
elif language == 'zh':
|
elif language == 'zh':
|
||||||
single_needle_zh_list.extend(names_list)
|
single_needle_zh_list.extend(names_list)
|
||||||
names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list
|
names_dict[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}'] = single_needle_list
|
||||||
names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list
|
names_dict[f'Single-Needle-Retrieval-EN-{dataset_size.upper()}'] = single_needle_en_list
|
||||||
names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list
|
names_dict[f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}'] = single_needle_zh_list
|
||||||
|
|
||||||
parallel_list = []
|
parallel_list = []
|
||||||
parallel_en_list = []
|
parallel_en_list = []
|
||||||
@ -74,39 +74,39 @@ def create_summarizer(context_lengths, depths, dataset_size,
|
|||||||
parallel_en_list.extend(names_list)
|
parallel_en_list.extend(names_list)
|
||||||
elif language == 'zh':
|
elif language == 'zh':
|
||||||
parallel_zh_list.extend(names_list)
|
parallel_zh_list.extend(names_list)
|
||||||
names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list
|
names_dict[f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}'] = parallel_list
|
||||||
names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list
|
names_dict[f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}'] = parallel_en_list
|
||||||
names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list
|
names_dict[f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}'] = parallel_zh_list
|
||||||
|
|
||||||
summary_groups = [
|
summary_groups = [
|
||||||
{'name': key, 'subsets': value} for key, value in names_dict.items()
|
{'name': key, 'subsets': value} for key, value in names_dict.items()
|
||||||
]
|
]
|
||||||
|
|
||||||
summary_groups.append({
|
summary_groups.append({
|
||||||
'name': 'NeedleBench-Overall-Score',
|
'name': f'NeedleBench-Overall-Score-{dataset_size.upper()}',
|
||||||
'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'],
|
'subsets': [[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}', 'naive_average'],
|
||||||
['Multi-Needle-Reasoning(M-RS)', 'naive_average'],
|
[f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}', 'naive_average'],
|
||||||
['Multi-Needle-Retrieval(M-RT)', 'average_score']],
|
[f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}', 'average_score']],
|
||||||
'weights': {'Single-Needle-Retrieval(S-RT)': 0.4,
|
'weights': {f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}': 0.4,
|
||||||
'Multi-Needle-Reasoning(M-RS)': 0.3,
|
f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}': 0.3,
|
||||||
'Multi-Needle-Retrieval(M-RT)': 0.3}})
|
f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}': 0.3}})
|
||||||
summarizer_config = {
|
summarizer_config = {
|
||||||
'type': NeedleBenchSummarizer,
|
'type': NeedleBenchSummarizer,
|
||||||
'summary_groups': summary_groups,
|
'summary_groups': summary_groups,
|
||||||
'dataset_abbrs': [
|
'dataset_abbrs': [
|
||||||
'NeedleBench-Overall-Score',
|
f'NeedleBench-Overall-Score-{dataset_size.upper()}',
|
||||||
f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
|
f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
|
||||||
'Single-Needle-Retrieval(S-RT)',
|
f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}',
|
||||||
'Single-Needle-Retrieval-EN',
|
f'Single-Needle-Retrieval-EN-{dataset_size.upper()}',
|
||||||
'Single-Needle-Retrieval-ZH',
|
f'Single-Needle-Retrieval-ZH-{dataset_size.upper()}',
|
||||||
f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
|
f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
|
||||||
'Multi-Needle-Retrieval(M-RT)',
|
f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}',
|
||||||
'Multi-Needle-Retrieval-EN',
|
f'Multi-Needle-Retrieval-EN-{dataset_size.upper()}',
|
||||||
'Multi-Needle-Retrieval-ZH',
|
f'Multi-Needle-Retrieval-ZH-{dataset_size.upper()}',
|
||||||
f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
|
f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
|
||||||
'Multi-Needle-Reasoning(M-RS)',
|
f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}',
|
||||||
'Multi-Needle-Reasoning-EN',
|
f'Multi-Needle-Reasoning-EN-{dataset_size.upper()}',
|
||||||
'Multi-Needle-Reasoning-ZH',
|
f'Multi-Needle-Reasoning-ZH-{dataset_size.upper()}',
|
||||||
f'2-Needle-EN-{dataset_size.upper()}',
|
f'2-Needle-EN-{dataset_size.upper()}',
|
||||||
f'2-Needle-ZH-{dataset_size.upper()}',
|
f'2-Needle-ZH-{dataset_size.upper()}',
|
||||||
f'3-Needle-EN-{dataset_size.upper()}',
|
f'3-Needle-EN-{dataset_size.upper()}',
|
||||||
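The summary group added above encodes a weighted overall score per dataset size. A self-contained sketch of the arithmetic it implies, using made-up subset scores and the 4K size suffix; whether the real NeedleBenchSummarizer normalizes by the weight sum is an assumption here:

# Toy sketch of the weighted overall score the summary group above encodes;
# the subset scores are made-up numbers, and the real summarizer reads them
# from evaluation results rather than a literal dict.
subset_scores = {
    'Single-Needle-Retrieval(S-RT)-4K': 85.0,   # naive_average
    'Multi-Needle-Reasoning(M-RS)-4K': 60.0,    # naive_average
    'Multi-Needle-Retrieval(M-RT)-4K': 70.0,    # average_score
}
weights = {
    'Single-Needle-Retrieval(S-RT)-4K': 0.4,
    'Multi-Needle-Reasoning(M-RS)-4K': 0.3,
    'Multi-Needle-Retrieval(M-RT)-4K': 0.3,
}
overall = sum(subset_scores[k] * weights[k] for k in weights) / sum(weights.values())
print(f'NeedleBench-Overall-Score-4K: {overall:.2f}')  # 73.00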
@@ -1,11 +1,13 @@
 # flake8: noqa
 import json
+import os
 import random

 from datasets import Dataset

 from opencompass.datasets.base import BaseDataset
 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path


 @LOAD_DATASET.register_module()
@@ -14,13 +16,20 @@ class NeedleBenchATCDataset(BaseDataset):
     @staticmethod
     def load(
         path,
+        file_name: str,
         num_needles: int,
         language: str,
         repeats: int,
     ):
         data = {'prompt': [], 'answer': []}
+        path = get_data_path(path)
+        if os.environ.get('DATASET_SOURCE') == 'HF':
+            from huggingface_hub import snapshot_download

-        with open(path, 'r', encoding='utf-8') as file:
+            path = snapshot_download(repo_id=path, repo_type='dataset')
+        file_path = os.path.join(path, file_name)
+
+        with open(file_path, 'r', encoding='utf-8') as file:
             names_data = json.load(file)

         all_names = names_data[language].split(',')
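The same resolve-then-open sequence recurs in every load() method touched by this commit. A condensed sketch of that flow, assuming opencompass and huggingface_hub are installed and that get_data_path returns the HF repo id unchanged when DATASET_SOURCE=HF (which is what the diff implies):

import os

def resolve_names_file(path: str, file_name: str) -> str:
    """Sketch of the shared resolution flow, not a function in the repo."""
    from opencompass.utils import get_data_path
    path = get_data_path(path)
    if os.environ.get('DATASET_SOURCE') == 'HF':
        from huggingface_hub import snapshot_download
        # Download the whole dataset repo once; individual files are then
        # plain local files under the snapshot directory.
        path = snapshot_download(repo_id=path, repo_type='dataset')
    return os.path.join(path, file_name)

# Usage: resolve_names_file('opencompass/needlebench', 'names.json')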
@@ -30,7 +39,16 @@ class NeedleBenchATCDataset(BaseDataset):
         if language == 'Chinese':

             relationship_terms = [
-                '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆'
+                '父亲',
+                '母亲',
+                '爸爸',
+                '妈妈',
+                '爷爷',
+                '奶奶',
+                '姥姥',
+                '姥爷',
+                '外公',
+                '外婆',
             ]

             relationship_templates = [
@@ -46,10 +64,16 @@ class NeedleBenchATCDataset(BaseDataset):
         elif language == 'English':

             relationship_terms = [
-                'father', 'mother', 'dad', 'mom', 'grandfather',
-                'grandmother', 'maternal grandmother',
-                'maternal grandfather', 'paternal grandfather',
-                'paternal grandmother'
+                'father',
+                'mother',
+                'dad',
+                'mom',
+                'grandfather',
+                'grandmother',
+                'maternal grandmother',
+                'maternal grandfather',
+                'paternal grandfather',
+                'paternal grandmother',
             ]

             relationship_templates = [
@@ -96,21 +120,20 @@ class NeedleBenchATCDataset(BaseDataset):

         # Generating the prompt based on the language
         if language == 'Chinese':
-            prompt = (f"""
+            prompt = f"""
 在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?
 例如:
 例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。
 例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。
 例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。
-""")
+"""
         elif language == 'English':
-            prompt = (f"""
+            prompt = f"""
 Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
 For example:
 Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark.
 Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang.
 Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang."""
-            )
         else:
             prompt = 'Language not supported.'
             raise Exception('Unsupported language specified. '
@@ -135,13 +158,20 @@ class NeedleBenchATCOrderedDataset(BaseDataset):
     @staticmethod
     def load(
         path,
+        file_name,
         num_needles: int,
         language: str,
         repeats: int,
     ):
         data = {'prompt': [], 'answer': []}
+        path = get_data_path(path)
+        if os.environ.get('DATASET_SOURCE') == 'HF':
+            from huggingface_hub import snapshot_download

-        with open(path, 'r', encoding='utf-8') as file:
+            path = snapshot_download(repo_id=path, repo_type='dataset')
+        file_path = os.path.join(path, file_name)
+
+        with open(file_path, 'r', encoding='utf-8') as file:
             names_data = json.load(file)

         all_names = names_data[language].split(',')
@@ -151,7 +181,16 @@ class NeedleBenchATCOrderedDataset(BaseDataset):
         if language == 'Chinese':

             relationship_terms = [
-                '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆'
+                '父亲',
+                '母亲',
+                '爸爸',
+                '妈妈',
+                '爷爷',
+                '奶奶',
+                '姥姥',
+                '姥爷',
+                '外公',
+                '外婆',
             ]

             relationship_templates = [
@@ -167,10 +206,16 @@ class NeedleBenchATCOrderedDataset(BaseDataset):
         elif language == 'English':

             relationship_terms = [
-                'father', 'mother', 'dad', 'mom', 'grandfather',
-                'grandmother', 'maternal grandmother',
-                'maternal grandfather', 'paternal grandfather',
-                'paternal grandmother'
+                'father',
+                'mother',
+                'dad',
+                'mom',
+                'grandfather',
+                'grandmother',
+                'maternal grandmother',
+                'maternal grandfather',
+                'paternal grandfather',
+                'paternal grandmother',
             ]

             relationship_templates = [
@@ -214,21 +259,20 @@ class NeedleBenchATCOrderedDataset(BaseDataset):

         # Generating the prompt based on the language
         if language == 'Chinese':
-            prompt = (f"""
+            prompt = f"""
 在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?
 例如:
 例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。
 例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。
 例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。
-""")
+"""
         elif language == 'English':
-            prompt = (f"""
+            prompt = f"""
 Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
 For example:
 Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark.
 Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang.
 Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang."""
-            )
         else:
             prompt = 'Language not supported.'
             raise Exception('Unsupported language specified. '
@@ -1,11 +1,13 @@
 # flake8: noqa
 import copy
 import json
+import os
 import random

 from datasets import Dataset

 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path

 from ..base import BaseDataset

@@ -46,11 +48,14 @@ def get_circular_example(entry, id):
 class NeedleBenchATCDataset(BaseDataset):

     @staticmethod
-    def load(path: str,
-             num_needles: int,
-             language: str,
-             repeats: int,
-             with_circular: bool = True):
+    def load(
+        path: str,
+        file_name: str,
+        num_needles: int,
+        language: str,
+        repeats: int,
+        with_circular: bool = True,
+    ):
         """NeedleBenthATC Dataset.

         Args:
@@ -61,8 +66,14 @@ class NeedleBenchATCDataset(BaseDataset):
         """
         data = []
         entry = {}
+        path = get_data_path(path)
+        if os.environ.get('DATASET_SOURCE') == 'HF':
+            from huggingface_hub import snapshot_download

-        with open(path, 'r', encoding='utf-8') as file:
+            path = snapshot_download(repo_id=path, repo_type='dataset')
+        file_path = os.path.join(path, file_name)
+
+        with open(file_path, 'r', encoding='utf-8') as file:
             names_data = json.load(file)

         all_names = names_data[language].split(',')
@@ -73,7 +84,16 @@ class NeedleBenchATCDataset(BaseDataset):
         if language == 'Chinese':

             relationship_terms = [
-                '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆'
+                '父亲',
+                '母亲',
+                '爸爸',
+                '妈妈',
+                '爷爷',
+                '奶奶',
+                '姥姥',
+                '姥爷',
+                '外公',
+                '外婆',
             ]

             relationship_templates = [
@@ -89,10 +109,16 @@ class NeedleBenchATCDataset(BaseDataset):
         elif language == 'English':

             relationship_terms = [
-                'father', 'mother', 'dad', 'mom', 'grandfather',
-                'grandmother', 'maternal grandmother',
-                'maternal grandfather', 'paternal grandfather',
-                'paternal grandmother'
+                'father',
+                'mother',
+                'dad',
+                'mom',
+                'grandfather',
+                'grandmother',
+                'maternal grandmother',
+                'maternal grandfather',
+                'paternal grandfather',
+                'paternal grandmother',
             ]

             relationship_templates = [
@@ -139,12 +165,11 @@ class NeedleBenchATCDataset(BaseDataset):

         # Generating the prompt based on the language
         if language == 'Chinese':
-            prompt = (f"""
-在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?""")
+            prompt = f"""
+在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?"""
         elif language == 'English':
-            prompt = (f"""
+            prompt = f"""
 Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?"""
-            )
         else:
             prompt = 'Language not supported.'
             raise Exception('Unsupported language specified. '
@@ -158,7 +183,8 @@ Given the scrambled family relationships described above, who is the eldest rela
             additional_names_needed = max(4 - len(names), 0)
             additional_names = random.sample(
                 [name for name in all_names if name not in names],
                additional_names_needed)
+                additional_names_needed,
+            )
             names.extend(additional_names)

             entry['options'] = names[0:4]
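The option padding in the hunk above, as a self-contained example: when a reasoning chain uses fewer than four names, distractors are sampled from the unused name pool so every circular-choice entry still has exactly four options. The names here are arbitrary placeholders:

import random

all_names = ['张伟', '王芳', '李娜', '刘强', '陈静', '杨洋']
names = ['张伟', '王芳']  # names actually used in this relationship chain

# Pad with distractors drawn from the names not already in the chain.
additional_names_needed = max(4 - len(names), 0)
additional_names = random.sample(
    [name for name in all_names if name not in names],
    additional_names_needed,
)
names.extend(additional_names)
options = names[0:4]
print(options)  # e.g. ['张伟', '王芳', '李娜', '杨洋']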
@@ -4,11 +4,11 @@ import random

 import tiktoken
 from datasets import Dataset
-from huggingface_hub import hf_hub_download

 from opencompass.datasets.base import BaseDataset
 from opencompass.openicl import BaseEvaluator
 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path


 def get_random_needles(counter, file_path, needle_count):
@@ -37,7 +37,7 @@ class NeedleBenchMultiDataset(BaseDataset):

     @staticmethod
     def load(
-        path: str,  # depreciated
+        path: str,
         length: int,
         depth: int,
         tokenizer_model: str,
@@ -152,25 +152,21 @@ class NeedleBenchMultiDataset(BaseDataset):

             return prompt

-        repo_id = 'opencompass/NeedleBench'
         file_names = [
             'PaulGrahamEssays.jsonl', 'multi_needle_reasoning_en.json',
             'multi_needle_reasoning_zh.json', 'zh_finance.jsonl',
             'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl',
             'zh_movie.jsonl', 'zh_tech.jsonl'
         ]
-        downloaded_files = []
-        base_file_path = ''
-        for file_name in file_names:
-            file_path = hf_hub_download(repo_id=repo_id,
-                                        filename=file_name,
-                                        repo_type='dataset')
-            downloaded_files.append(file_path)
-            base_file_path = '/'.join(file_path.split('/')[:-1])
+        path = get_data_path(path)
+        if os.environ.get('DATASET_SOURCE') == 'HF':
+            from huggingface_hub import snapshot_download
+            path = snapshot_download(repo_id=path, repo_type='dataset')
+        needle_file_path = os.path.join(path, needle_file_name)

-        needle_file_path = os.path.join(base_file_path, needle_file_name)
-        for file_path in downloaded_files:
-            if file_path.split('/')[-1] not in file_list:
+        for file_name in file_names:
+            file_path = os.path.join(path, file_name)
+            if file_name not in file_list:
                 continue

             with open(file_path, 'r', encoding='utf-8') as f:
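get_random_needles is called by this dataset but its body sits outside the diff. A plausible reading, assuming a JSONL needle file and counter-seeded sampling for reproducibility; this is a sketch, not a quote of the repo's implementation:

import json
import random

def get_random_needles(counter, file_path, needle_count):
    """Hypothetical sketch: sample `needle_count` records, seeded by repeat
    counter so each repeat draws the same needles across runs."""
    with open(file_path, 'r', encoding='utf-8') as f:
        records = [json.loads(line.strip()) for line in f]
    random.seed(counter)
    return random.sample(records, needle_count)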
@@ -5,11 +5,11 @@ import re

 import tiktoken
 from datasets import Dataset
-from huggingface_hub import hf_hub_download

 from opencompass.datasets.base import BaseDataset
 from opencompass.openicl import BaseEvaluator
 from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+from opencompass.utils import get_data_path


 def get_random_line_by_language(counter, file_path, language):
@@ -36,7 +36,7 @@ class NeedleBenchOriginDataset(BaseDataset):

     @staticmethod
     def load(
-        path: str,  # depreciated
+        path: str,
         length: int,
         depth: int,
         tokenizer_model: str,
@@ -128,33 +128,29 @@ class NeedleBenchOriginDataset(BaseDataset):

             return prompt

-        repo_id = 'opencompass/NeedleBench'
         file_names = [
-            'PaulGrahamEssays.jsonl', 'needles.jsonl', 'zh_finance.jsonl',
+            'PaulGrahamEssays.jsonl', 'multi_needle_reasoning_en.json',
+            'multi_needle_reasoning_zh.json', 'zh_finance.jsonl',
             'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl',
             'zh_movie.jsonl', 'zh_tech.jsonl'
         ]
+        path = get_data_path(path)
+        if os.environ.get('DATASET_SOURCE') == 'HF':
+            from huggingface_hub import snapshot_download
+            path = snapshot_download(repo_id=path, repo_type='dataset')
+        needle_file_path = os.path.join(path, needle_file_name)

-        downloaded_files = []
-        base_file_path = ''
         for file_name in file_names:
-            file_path = hf_hub_download(repo_id=repo_id,
-                                        filename=file_name,
-                                        repo_type='dataset')
-            downloaded_files.append(file_path)
-            base_file_path = '/'.join(file_path.split('/')[:-1])
-
-        for file_path in downloaded_files:
-            if file_path.split('/')[-1] not in file_list:
+            file_path = os.path.join(path, file_name)
+            if file_name not in file_list:
                 continue

             with open(file_path, 'r', encoding='utf-8') as f:
                 lines_bak = [json.loads(line.strip()) for line in f]
             lines = lines_bak.copy()
             for counter in range(num_repeats_per_file):
                 random.seed(counter)
                 random.shuffle(lines)
-                needle_file_path = os.path.join(base_file_path,
-                                                needle_file_name)
                 random_needle = get_random_line_by_language(
                     counter, needle_file_path, language)
                 needle = '\n' + random_needle['needle'] + '\n'
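One detail worth noting in the hunk above: the shuffle is re-seeded with the repeat counter, so every repeat is reproducible across runs. A minimal, self-contained demonstration:

import random

lines_bak = list(range(10))
for counter in range(3):
    lines = lines_bak.copy()
    random.seed(counter)      # per-repeat seed
    random.shuffle(lines)
    # The printed prefix is identical every time the script runs,
    # for any given counter value.
    print(counter, lines[:3])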
@@ -1,21 +1,24 @@
 import json
+import os
 import random

 import tiktoken
 from datasets import Dataset
-from huggingface_hub import hf_hub_download

 from opencompass.datasets.base import BaseDataset
 from opencompass.openicl import BaseEvaluator
 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path


-def get_unique_entries(file_path,
-                       n,
-                       language,
-                       unique_arg1=False,
-                       unique_arg2=False,
-                       unique_combination=False):
+def get_unique_entries(
+    file_path,
+    n,
+    language,
+    unique_arg1=False,
+    unique_arg2=False,
+    unique_combination=False,
+):
     seen_arg1 = set()
     seen_arg2 = set()
     seen_combinations = set()
@@ -38,9 +41,11 @@ def get_unique_entries(file_path,
         key2 = entry.get('arg2', '') if unique_arg2 else ''
         combination = (key1, key2) if unique_combination else ''

-        if (key1 not in seen_arg1 or not unique_arg1) and \
-           (key2 not in seen_arg2 or not unique_arg2) and \
-           (combination not in seen_combinations or not unique_combination):
+        if ((key1 not in seen_arg1 or not unique_arg1)  # noqa: E501
+                and (key2 not in seen_arg2 or not unique_arg2)
+                and  # noqa: E501
+            (combination not in seen_combinations
+             or not unique_combination)):  # noqa: E501
             seen_arg1.add(key1)
             seen_arg2.add(key2)
             seen_combinations.add(combination)
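The reflowed condition above is behavior-preserving but hard to scan. An equivalent, self-contained restatement of the same uniqueness filter (the helper name is ours, not the repo's):

def is_new(entry, seen_arg1, seen_arg2, seen_combinations,
           unique_arg1=False, unique_arg2=False, unique_combination=False):
    """Each uniqueness flag only constrains its own key; a disabled flag
    makes the corresponding clause vacuously true."""
    key1 = entry.get('arg1', '') if unique_arg1 else ''
    key2 = entry.get('arg2', '') if unique_arg2 else ''
    combination = (key1, key2) if unique_combination else ''
    return ((key1 not in seen_arg1 or not unique_arg1)
            and (key2 not in seen_arg2 or not unique_arg2)
            and (combination not in seen_combinations
                 or not unique_combination))

seen1, seen2, combos = set(), set(), set()
entry = {'arg1': 'Alice', 'arg2': 'Bob'}
if is_new(entry, seen1, seen2, combos, unique_arg1=True, unique_arg2=True):
    seen1.add('Alice')
    seen2.add('Bob')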
@@ -57,7 +62,7 @@ class NeedleBenchParallelDataset(BaseDataset):

     @staticmethod
     def load(
-        path: str,  # depreciated
+        path: str,
         needle_file_name: str,
         length: int,
         depths: list[int],
@@ -72,30 +77,32 @@ class NeedleBenchParallelDataset(BaseDataset):
         data = {'prompt': [], 'answer': []}
         tokenizer = tiktoken.encoding_for_model(tokenizer_model)

-        repo_id = 'opencompass/NeedleBench'
         file_names = [
-            'PaulGrahamEssays.jsonl', 'needles.jsonl', 'zh_finance.jsonl',
-            'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl',
-            'zh_movie.jsonl', 'zh_tech.jsonl'
+            'PaulGrahamEssays.jsonl',
+            'multi_needle_reasoning_en.json',
+            'multi_needle_reasoning_zh.json',
+            'zh_finance.jsonl',
+            'zh_game.jsonl',
+            'zh_general.jsonl',
+            'zh_government.jsonl',
+            'zh_movie.jsonl',
+            'zh_tech.jsonl',
         ]
+        path = get_data_path(path)
+        if os.environ.get('DATASET_SOURCE') == 'HF':
+            from huggingface_hub import snapshot_download

-        downloaded_files = []
-        for file_name in file_names:
-            file_path = hf_hub_download(repo_id=repo_id,
-                                        filename=file_name,
-                                        repo_type='dataset')
-            downloaded_files.append(file_path)
+            path = snapshot_download(repo_id=path, repo_type='dataset')
+        needle_file_path = os.path.join(path, needle_file_name)

-        for file in downloaded_files:
-            if file.split('/')[-1] == needle_file_name:
-                needle_file_path = file
-
-        predefined_needles_bak = get_unique_entries(needle_file_path,
-                                                    len(depths),
-                                                    language,
-                                                    unique_arg1=True,
-                                                    unique_arg2=True,
-                                                    unique_combination=True)
+        predefined_needles_bak = get_unique_entries(
+            needle_file_path,
+            len(depths),
+            language,
+            unique_arg1=True,
+            unique_arg2=True,
+            unique_combination=True,
+        )

         def _generate_context(tokens_context, depths, needles):
             insertion_points = [
@@ -108,10 +115,12 @@ class NeedleBenchParallelDataset(BaseDataset):
                 needle_tokens = _get_tokens_from_context(needle)
                 current_insertion_point = min(
                     insertion_points[i] + cumulative_inserted_length,
-                    len(tokens_context))
+                    len(tokens_context),
+                )

-                tokens_context = tokens_context[:current_insertion_point] + \
-                    needle_tokens + tokens_context[current_insertion_point:]
+                tokens_context = (tokens_context[:current_insertion_point] +
+                                  needle_tokens +
+                                  tokens_context[current_insertion_point:])
                 cumulative_inserted_length += len(needle_tokens)

             new_context = _decode_tokens(tokens_context)
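The splice above operates on token ids. The same logic with a word list standing in for tiktoken tokens, to make the cumulative-offset bookkeeping concrete:

# Self-contained illustration of the splice in the hunk above: insert each
# needle at its precomputed point, shifting later points by the length of
# everything already inserted.
tokens_context = 'the quick brown fox jumps over the lazy dog'.split()
needles = [['NEEDLE1'], ['NEEDLE2']]
insertion_points = [2, 6]  # indices into the original, un-spliced context

cumulative_inserted_length = 0
for i, needle_tokens in enumerate(needles):
    current_insertion_point = min(
        insertion_points[i] + cumulative_inserted_length,
        len(tokens_context),
    )
    tokens_context = (tokens_context[:current_insertion_point] +
                      needle_tokens +
                      tokens_context[current_insertion_point:])
    cumulative_inserted_length += len(needle_tokens)

print(' '.join(tokens_context))
# the quick NEEDLE1 brown fox jumps over NEEDLE2 the lazy dog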
@@ -191,8 +200,9 @@ class NeedleBenchParallelDataset(BaseDataset):

             return prompt

-        for file_path in downloaded_files:
-            if file_path.split('/')[-1] not in file_list:
+        for file_name in file_names:
+            file_path = os.path.join(path, file_name)
+            if file_name not in file_list:
                 continue

             with open(file_path, 'r', encoding='utf-8') as f:
@@ -219,8 +229,8 @@ class NeedleBenchParallelDataset(BaseDataset):
                     item['retrieval_question'].split("'")[1].split('。')[0]
                     for item in predefined_needles
                 ])
-                retrieval_question = questions + "请按照'" + \
-                    answers_format + "'的格式回答。"
+                retrieval_question = (questions + "请按照'" + answers_format +
+                                      "'的格式回答。")
             elif language == 'English':
                 questions = '、'.join([
                     item['retrieval_question'].split('?')[0] + '?'
@@ -231,14 +241,14 @@ class NeedleBenchParallelDataset(BaseDataset):
                     item['retrieval_question'].split("'")[1].split('.')[0]
                     for item in predefined_needles
                 ])
-                retrieval_question = questions + \
-                    "Please answer in the format of '" + \
-                    answers_format + "'"
+                retrieval_question = (questions +
+                                      "Please answer in the format of '" +
+                                      answers_format + "'")

             context_length = length - length_buffer
-            target_length_per_record = context_length - \
-                sum(len(tokens) for tokens
-                    in _get_tokens_from_context(needles))
+            target_length_per_record = context_length - sum(
+                len(tokens)
+                for tokens in _get_tokens_from_context(needles))
             target_length_per_record = max(target_length_per_record, 0)
             accumulated_tokens = []
             for line in lines:
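The token-budget arithmetic at the end of the hunk, as a worked example with made-up numbers:

# The target context length reserves a buffer for the question/template and
# subtracts the tokens the needles themselves will occupy.
length = 8000                        # requested total length
length_buffer = 600                  # reserved for prompt scaffolding
needle_token_counts = [25, 30, 28]   # len(tokens) per needle

context_length = length - length_buffer                        # 7400
target_length_per_record = context_length - sum(needle_token_counts)
target_length_per_record = max(target_length_per_record, 0)
print(target_length_per_record)      # 7317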
@@ -317,7 +327,8 @@ class NeedleBenchParallelEvaluator(BaseEvaluator):
         }

         result = {
-            **flattened_scores, 'details': details,
-            'average_score': average_score
+            **flattened_scores,
+            'details': details,
+            'average_score': average_score,
         }
         return result
@@ -265,6 +265,12 @@ DATASETS_MAPPING = {
         "hf_id": "opencompass/xsum",
         "local": "./data/Xsum/dev.jsonl",
     },
+    # Needlebench
+    "opencompass/needlebench": {
+        "ms_id": "",
+        "hf_id": "opencompass/needlebench",
+        "local": "./data/needlebench",
+    }
 }

 DATASETS_URL = {
@@ -396,4 +402,8 @@ DATASETS_URL = {
         "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip",
         "md5": "e3200c7380f4cea5f13c768f2815fabb",
     },
+    "/needlebench": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip",
+        "md5": "b546da0397746eaff4d3ff0f20d6ede2",
+    }
 }
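How the two new registry entries fit together is implied rather than shown: the mapping's local path is preferred when the data exists on disk, and the OSS zip (keyed by a path suffix) is the auto-download fallback. A sketch of that resolution order under those assumptions; get_data_path's real implementation is not part of this diff:

import os

# The two entries added above, inlined for the sketch.
DATASETS_MAPPING = {
    'opencompass/needlebench': {
        'ms_id': '',
        'hf_id': 'opencompass/needlebench',
        'local': './data/needlebench',
    },
}
DATASETS_URL = {
    '/needlebench': {
        'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/'
               'datasets/data/needlebench.zip',
        'md5': 'b546da0397746eaff4d3ff0f20d6ede2',
    },
}

def resolve(alias: str) -> str:
    """Hypothetical resolution order: local dir, else zip download, else HF."""
    entry = DATASETS_MAPPING[alias]
    if os.path.isdir(entry['local']):
        return entry['local']
    for suffix, info in DATASETS_URL.items():
        if entry['local'].endswith(suffix):
            # A real implementation would download info['url'], verify
            # info['md5'], and unpack into entry['local'] before returning.
            return entry['local']
    return entry['hf_id']

print(resolve('opencompass/needlebench'))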