update needlebench

This commit is contained in:
Mor-Li 2025-04-25 17:42:47 +08:00
parent e8bc8c1e8c
commit bd17b3c984
45 changed files with 2212 additions and 4623 deletions

View File

@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.atc import NeedleBenchATCDataset
from opencompass.datasets.needlebench.atc import needlebench_atc_postprocess_v2
from opencompass.datasets.needlebench.atc import NeedleBenchATCEvaluator
# ----------------------- Prompt Settings ----------------------- #
# Needle counts (number of family-relationship facts) to sweep over.
needle_num_list = [2, 4, 8, 16, 32, 64, 128, 256, 512]
path = 'opencompass/needlebench'
file_name = 'names.json'
repeats = 10

# ----------------------- Dataset Settings ----------------------- #
needlebench_datasets = []

# The dataset yields a fully rendered 'prompt' column; 'answer' is the label.
needlebench_atc_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

# Single-turn generation: feed the prompt verbatim to the model.
needlebench_atc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Dedicated ATC evaluator with its v2 answer-extraction postprocessor.
needlebench_atc_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchATCEvaluator),
    pred_postprocessor=dict(type=needlebench_atc_postprocess_v2),
)
# Instantiate one English ATC dataset config per needle count.
for num_needles in needle_num_list:
    abbr = f'NeedleBenchATCDataset-{num_needles}Needle-EN'
    language = 'English'
    dataset_dict = dict(
        abbr=abbr,
        type=NeedleBenchATCDataset,
        path=path,
        file_name=file_name,
        num_needles=num_needles,
        language=language,
        repeats=repeats,
        reader_cfg=needlebench_atc_reader_cfg,
        infer_cfg=needlebench_atc_infer_cfg,
        eval_cfg=needlebench_atc_eval_cfg,
    )
    needlebench_datasets.append(dataset_dict)

View File

@ -1,136 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator
from opencompass.datasets.needlebench.atc_choice import NeedleBenchATCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
# ----------------------- Prompt Content----------------------- #
# Few-shot exemplars for the ATC (Ancestor Tracing Challenge) single-choice
# task. Four variants: Chinese/English, each in a direct-answer form and a
# "reasoning" form whose BOT turns spell out the relationship chain. The
# last entry of every list is the '{question}' template filled at runtime.
few_shot_prompts = {
    'single_choice_prompts': {
        'single_choice_cn': [
            dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
            dict(role='BOT', prompt='回答: A'),
            dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
            dict(role='BOT', prompt='回答: D'),
            dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
            dict(role='BOT', prompt='回答: C'),
            dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
            dict(role='BOT', prompt='回答: D'),
            dict(role='HUMAN', prompt='问题: {question}'),
        ],
        'single_choice_en': [
            dict(role='HUMAN', prompt="Question: Jasmine Lane plays the role of James Hill's father in James Hill's life. \nGiven the scrambled family relationships described above, who is the eldest relative that 'James Hill' can trace back to in the context?\nA. Jasmine Lane\nB. James Hill\nC. Christopher Flores\nD. Paula Reeves\n"),
            dict(role='BOT', prompt='Answer: A'),
            dict(role='HUMAN', prompt="Question: For Michael Goodwin, Annette Valdez is not just a dad, but also a friend.For Annette Valdez, Mary Sanders is not just a father, but also a friend. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Michael Goodwin' can trace back to in the context?\nA. Michael Goodwin\nB. Annette Valdez\nC. Mary Sanders\nD. James Mitchell\n"),
            dict(role='BOT', prompt='Answer: C'),
            dict(role='HUMAN', prompt="Question: Laura Garcia considers Mr. Stephen Valdez DDS as their mom.Mr. Stephen Valdez DDS's maternal grandfather is Elizabeth Smith.Laura Garcia is Ashley Johnson's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Ashley Johnson' can trace back to in the context?\nA. Ashley Johnson\nB. Mr. Stephen Valdez DDS\nC. Laura Garcia\nD. Elizabeth Smith\n"),
            dict(role='BOT', prompt='Answer: D'),
            dict(role='HUMAN', prompt="Question: Justin Sanchez's paternal grandfather is Laurie Bean.For Laurie Bean, Christina Kim is not just a mother, but also a friend.Jessica Bass, as Christina Kim's maternal grandfather, has a significant impact on Christina Kim's upbringing.Sarah Hampton considers Justin Sanchez as their father. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Sarah Hampton' can trace back to in the context?\nA. Christina Kim\nB. Jessica Bass\nC. Laurie Bean\nD. Justin Sanchez\n"),
            dict(role='BOT', prompt='Answer: B'),
            dict(role='HUMAN', prompt='Question: {question}'),
        ],
        'single_choice_cn_reasoning': [
            dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 刘红的爸爸是毕东。\n\n因此,"刘红"能够向上追溯到的最年长的亲人是毕东。所以答案是A'),
            dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 蓝鑫把黄玉兰视为其母亲。\n- 黄玉兰的姥姥是梅建平。\n- 梅建平的妈妈是朱丽丽。\n- 朱丽丽的妈妈是焦慧。\n\n因此,"蓝鑫"能够向上追溯到的最年长的亲人是焦慧。所以答案是D'),
            dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 常宁的外公是余萍。\n- 余萍把杨颖视为其爸爸。\n- 杨颖的外公是刘慧。\n- 刘慧是郭建华所生的孩子。\n- 郭建华的父亲是毕东。\n- 毕东的姥姥是柳金凤。\n- 柳金凤的妈妈是奉兵。\n\n因此,"常宁"能够向上追溯到的最年长的亲人是奉兵。所以答案是C'),
            dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 郑玉珍的外婆是谢平。\n- 谢平的奶奶是魏丽丽。\n- 魏丽丽的外公是魏阳。 \n- 魏阳的姥姥是郭兵。\n- 郭兵的爷爷是李建国。 \n- 李建国的外婆是丁颖。 \n- 丁颖的外公是武波。 \n- 武波的父亲是刘瑜。\n- 刘瑜的爷爷是许玲。\n- 许玲的父亲是余桂芳。 \n\n因此,"郑玉珍"能够向上追溯到的最年长的亲人是余桂芳。所以答案是D'),
            dict(role='HUMAN', prompt='问题: {question}'),
        ],
        'single_choice_en_reasoning': [
            dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
            dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
            dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
            dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
            dict(role='HUMAN', prompt='Question: {question}'),
        ],
    },
}

# ----------------------- Prompt Settings ----------------------- #
# Sweep 2..19 needles (family-relationship facts) per question.
needle_num_list = list(range(2, 20, 1))
path = 'opencompass/needlebench'
file_name = 'names.json'
repeats = 10
# Whether to prepend few-shot examples; False selects zero-shot prompting.
with_few_shot = True
# Number of few-shot QA pairs to keep (1..4); only read when `with_few_shot`.
few_shot_samples = 4
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True

needlebench_prompts = few_shot_prompts
single_choice_prompts = needlebench_prompts['single_choice_prompts']

# Trim each variant to the requested number of shots. Each example is a
# HUMAN/BOT pair (2 entries) and the final entry is the question template.
for _name in list(single_choice_prompts.keys()):
    if with_few_shot:
        # Explicit validation instead of `assert`, which is stripped under -O.
        if not 1 <= few_shot_samples <= 4:
            raise ValueError(
                f'few_shot_samples must be in [1, 4], got {few_shot_samples}')
        single_choice_prompts[_name] = \
            single_choice_prompts[_name][-few_shot_samples * 2 - 1:]
    else:
        # Zero-shot: keep only the trailing '{question}' template. Previously
        # `with_few_shot = False` was a silent no-op and all shots were kept.
        single_choice_prompts[_name] = single_choice_prompts[_name][-1:]
# ----------------------- Dataset Settings ----------------------- #
needlebench_datasets = []

# The dataset yields a 'question' column; 'answer' holds the gold option.
needlebench_atc_reader_cfg = dict(input_columns=['question'],
                                  output_column='answer')

# One dataset per (prompt variant, needle count): variants are CN/EN, each
# with and without a few-shot reasoning chain in the BOT answers.
# NOTE: the redundant `current_needle_num_list = needle_num_list` alias was
# removed — the loop iterates `needle_num_list` directly, matching the
# sibling configs.
for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    # Circular evaluation permutes the options; predictions are reduced to a
    # single letter A-D before scoring.
    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = 'English' if 'en' in _name else 'Chinese'
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': path,
            'file_name': file_name,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)

View File

@ -1,133 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator
from opencompass.datasets.needlebench.atc_choice import NeedleBenchATCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
# ----------------------- Prompt Content----------------------- #
# Few-shot exemplars for the ATC (Ancestor Tracing Challenge) single-choice
# task. Four variants: Chinese/English, each in a direct-answer form and a
# "reasoning" form whose BOT turns spell out the relationship chain. The
# last entry of every list is the '{question}' template filled at runtime.
# FIX: the 4th 'single_choice_cn_reasoning' question contained a stray 'w'
# ("扮演着w奶奶") and dropped commas ("李建国对郭兵来说不只是一个爷爷还是
# 一个朋友"); restored to match the identical exemplar in 'single_choice_cn'.
few_shot_prompts = {
    'single_choice_prompts': {
        'single_choice_cn': [
            dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
            dict(role='BOT', prompt='回答: A'),
            dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
            dict(role='BOT', prompt='回答: D'),
            dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
            dict(role='BOT', prompt='回答: C'),
            dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
            dict(role='BOT', prompt='回答: D'),
            dict(role='HUMAN', prompt='问题: {question}'),
        ],
        'single_choice_en': [
            dict(role='HUMAN', prompt="Question: Jasmine Lane plays the role of James Hill's father in James Hill's life. \nGiven the scrambled family relationships described above, who is the eldest relative that 'James Hill' can trace back to in the context?\nA. Jasmine Lane\nB. James Hill\nC. Christopher Flores\nD. Paula Reeves\n"),
            dict(role='BOT', prompt='Answer: A'),
            dict(role='HUMAN', prompt="Question: For Michael Goodwin, Annette Valdez is not just a dad, but also a friend.For Annette Valdez, Mary Sanders is not just a father, but also a friend. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Michael Goodwin' can trace back to in the context?\nA. Michael Goodwin\nB. Annette Valdez\nC. Mary Sanders\nD. James Mitchell\n"),
            dict(role='BOT', prompt='Answer: C'),
            dict(role='HUMAN', prompt="Question: Laura Garcia considers Mr. Stephen Valdez DDS as their mom.Mr. Stephen Valdez DDS's maternal grandfather is Elizabeth Smith.Laura Garcia is Ashley Johnson's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Ashley Johnson' can trace back to in the context?\nA. Ashley Johnson\nB. Mr. Stephen Valdez DDS\nC. Laura Garcia\nD. Elizabeth Smith\n"),
            dict(role='BOT', prompt='Answer: D'),
            dict(role='HUMAN', prompt="Question: Justin Sanchez's paternal grandfather is Laurie Bean.For Laurie Bean, Christina Kim is not just a mother, but also a friend.Jessica Bass, as Christina Kim's maternal grandfather, has a significant impact on Christina Kim's upbringing.Sarah Hampton considers Justin Sanchez as their father. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Sarah Hampton' can trace back to in the context?\nA. Christina Kim\nB. Jessica Bass\nC. Laurie Bean\nD. Justin Sanchez\n"),
            dict(role='BOT', prompt='Answer: B'),
            dict(role='HUMAN', prompt='Question: {question}'),
        ],
        'single_choice_cn_reasoning': [
            dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 刘红的爸爸是毕东。\n\n因此,"刘红"能够向上追溯到的最年长的亲人是毕东。所以答案是A'),
            dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 蓝鑫把黄玉兰视为其母亲。\n- 黄玉兰的姥姥是梅建平。\n- 梅建平的妈妈是朱丽丽。\n- 朱丽丽的妈妈是焦慧。\n\n因此,"蓝鑫"能够向上追溯到的最年长的亲人是焦慧。所以答案是D'),
            dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 常宁的外公是余萍。\n- 余萍把杨颖视为其爸爸。\n- 杨颖的外公是刘慧。\n- 刘慧是郭建华所生的孩子。\n- 郭建华的父亲是毕东。\n- 毕东的姥姥是柳金凤。\n- 柳金凤的妈妈是奉兵。\n\n因此,"常宁"能够向上追溯到的最年长的亲人是奉兵。所以答案是C'),
            dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
            dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 郑玉珍的外婆是谢平。\n- 谢平的奶奶是魏丽丽。\n- 魏丽丽的外公是魏阳。 \n- 魏阳的姥姥是郭兵。\n- 郭兵的爷爷是李建国。 \n- 李建国的外婆是丁颖。 \n- 丁颖的外公是武波。 \n- 武波的父亲是刘瑜。\n- 刘瑜的爷爷是许玲。\n- 许玲的父亲是余桂芳。 \n\n因此,"郑玉珍"能够向上追溯到的最年长的亲人是余桂芳。所以答案是D'),
            dict(role='HUMAN', prompt='问题: {question}'),
        ],
        'single_choice_en_reasoning': [
            dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
            dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
            dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
            dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
            dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
            dict(role='HUMAN', prompt='Question: {question}'),
        ],
    },
}

# ----------------------- Prompt Settings ----------------------- #
# Sweep 2..19 needles (family-relationship facts) per question.
needle_num_list = list(range(2, 20, 1))
path = 'opencompass/needlebench'
file_name = 'names.json'
repeats = 10
# Whether to prepend few-shot examples; False selects zero-shot prompting.
with_few_shot = True
# Number of few-shot QA pairs to keep (1..4); only read when `with_few_shot`.
few_shot_samples = 4
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True

needlebench_prompts = few_shot_prompts
single_choice_prompts = needlebench_prompts['single_choice_prompts']

# Trim each variant to the requested number of shots. Each example is a
# HUMAN/BOT pair (2 entries) and the final entry is the question template.
for _name in list(single_choice_prompts.keys()):
    if with_few_shot:
        # Explicit validation instead of `assert`, which is stripped under -O.
        if not 1 <= few_shot_samples <= 4:
            raise ValueError(
                f'few_shot_samples must be in [1, 4], got {few_shot_samples}')
        single_choice_prompts[_name] = \
            single_choice_prompts[_name][-few_shot_samples * 2 - 1:]
    else:
        # Zero-shot: keep only the trailing '{question}' template. Previously
        # `with_few_shot = False` was a silent no-op and all shots were kept.
        single_choice_prompts[_name] = single_choice_prompts[_name][-1:]
# ----------------------- Dataset Settings ----------------------- #
needlebench_datasets = []

# The dataset yields a 'question' column; 'answer' holds the gold option.
needlebench_atc_reader_cfg = dict(input_columns=['question'],
                                  output_column='answer')

# One dataset config per prompt variant, swept over every needle count.
for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=single_choice_prompts[_name]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess,
                                options='ABCD'),
    )

    _is_en = 'en' in _name
    for num_needles in needle_num_list:
        abbr = 'NeedleBenchATCDataset-{}Needle-{}'.format(
            num_needles, 'EN' if _is_en else 'ZH')
        language = 'English' if _is_en else 'Chinese'
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': path,
            'file_name': file_name,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg,
        }
        needlebench_datasets.append(dataset_dict)

View File

@ -1,43 +0,0 @@
from mmengine.config import read_base
with read_base():
from .atc_choice_20 import *
# Widen the needle sweep to 2..49 and rebuild the dataset list, reusing the
# prompt variants and base settings pulled in via `read_base` above.
needle_num_list = list(range(2, 50))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=single_choice_prompts[_name]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess,
                                options='ABCD'),
    )

    _is_en = 'en' in _name
    for num_needles in needle_num_list:
        abbr = 'NeedleBenchATCDataset-{}Needle-{}'.format(
            num_needles, 'EN' if _is_en else 'ZH')
        language = 'English' if _is_en else 'Chinese'
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': path,
            'file_name': file_name,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg,
        }
        needlebench_datasets.append(dataset_dict)

View File

@ -53,11 +53,8 @@ path = 'opencompass/needlebench'
file_name = 'names.json'
repeats = 10
# Use Zero-Shot or not
with_few_shot = True
# Max for this dataset is 4, should be set with `with_few_shot`
few_shot_samples = 4
# Max for this dataset is 4, if you don't want to use few shot, set few_shot_samples to 0
few_shot_samples = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
@ -70,11 +67,10 @@ single_choice_prompts = needlebench_prompts['single_choice_prompts']
# Set few shot prompt number
for _name in list(single_choice_prompts.keys()):
if with_few_shot:
assert few_shot_samples > 0 and few_shot_samples <= 4
single_choice_prompts[_name] = single_choice_prompts[_name][
-few_shot_samples * 2 - 1 :
]
assert few_shot_samples >= 0 and few_shot_samples <= 4
single_choice_prompts[_name] = single_choice_prompts[_name][
-few_shot_samples * 2 - 1 :
]
# ----------------------- Dataset Settings ----------------------- #

View File

@ -1,43 +0,0 @@
from mmengine.config import read_base
with read_base():
from .atc_choice_20 import *
# Widen the needle sweep to 2..79 and rebuild the dataset list, reusing the
# prompt variants and base settings pulled in via `read_base` above.
needle_num_list = list(range(2, 80))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=single_choice_prompts[_name]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess,
                                options='ABCD'),
    )

    _is_en = 'en' in _name
    for num_needles in needle_num_list:
        abbr = 'NeedleBenchATCDataset-{}Needle-{}'.format(
            num_needles, 'EN' if _is_en else 'ZH')
        language = 'English' if _is_en else 'Chinese'
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': path,
            'file_name': file_name,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg,
        }
        needlebench_datasets.append(dataset_dict)

View File

@ -1,97 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator
from opencompass.datasets.needlebench.atc_choice import NeedleBenchATCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
# ----------------------- Prompt Content----------------------- #
few_shot_prompts = {
'single_choice_prompts': {
'single_choice_en_reasoning': [
dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
dict(role='HUMAN', prompt='Question: {question}'),
],
},
}
# ----------------------- Prompt Settings ----------------------- #
needle_num_list = list(range(2, 80, 1))
path = 'opencompass/needlebench'
file_name = 'names.json'
repeats = 10
# Use Zero-Shot or not
with_few_shot = True
# Max for this dataset is 4, should be set with `with_few_shot`
few_shot_samples = 4
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
needlebench_prompts = few_shot_prompts
single_choice_prompts = needlebench_prompts['single_choice_prompts']
# Set few shot prompt number
for _name in list(single_choice_prompts.keys()):
if with_few_shot:
assert few_shot_samples > 0 and few_shot_samples <= 4
single_choice_prompts[_name] = \
single_choice_prompts[_name][- few_shot_samples * 2 - 1:]
# ----------------------- Dataset Settings ----------------------- #
needlebench_datasets = []
needlebench_atc_reader_cfg = dict(input_columns=['question'],
output_column='answer')
for _name in list(single_choice_prompts.keys()):
needlebench_atc_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=(single_choice_prompts[_name])),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,),
)
needlebench_atc_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
for num_needles in needle_num_list:
abbr = (f'NeedleBenchATCDataset-'
f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
language = 'English' if 'en' in _name else 'Chinese'
if 'reasoning' in _name:
abbr += '-Reasoning'
dataset_dict = {
'abbr': abbr,
'type': NeedleBenchATCDataset,
'path': path,
'file_name': file_name,
'num_needles': num_needles,
'language': language,
'repeats': repeats,
'with_circular': with_circular_eval,
'reader_cfg': needlebench_atc_reader_cfg,
'infer_cfg': needlebench_atc_infer_cfg,
'eval_cfg': needlebench_atc_eval_cfg
}
needlebench_datasets.append(dataset_dict)

View File

@ -1,18 +1,18 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_1000k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_1000k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_1000k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_1000k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_1000k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_1000k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_1000k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_1000k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_reasoning_1000k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_1000k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_1000k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_1000k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_1000k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_single_1000k import needlebench_en_datasets as needlebench_origin_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_single_1000k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_retrieval_1000k import needlebench_en_datasets as needlebench_parallel_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_1000k.needlebench_multi_retrieval_1000k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -1,288 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_1000k import depths_list, context_lengths
from .needlebench_single_1000k import needlebench_reader_cfg, needlebench_infer_cfg
from opencompass.configs.datasets.needlebench.atc.atc_0shot_nocot_2_power_en import needlebench_atc_eval_cfg as needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
    """Logistic (sigmoid) curve at ``x``: maximum ``L``, midpoint ``x0``,
    steepness ``k``; result rounded to 3 decimal places."""
    denominator = 1 + math.exp(-k * (x - x0))
    return round(L / denominator, 3)
def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end`` inclusive.

    Raises ``ValueError`` when ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError('num must be at least 1.')
    if num == 1:
        return [start]
    step = (end - start) / (num - 1)
    values = []
    for idx in range(num):
        values.append(start + step * idx)
    return values
def generate_depth_percents(intervals, interval_type):
    """Return ``intervals`` depth percentages spanning 0-100.

    'linear' spaces them evenly; 'sigmoid' maps the evenly spaced points
    through the logistic curve. Any other type raises ``ValueError``.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        points = generate_linear_space(0, 100, intervals)
        return [logistic(point) for point in points]
    raise ValueError('Unsupported interval type')
# Reader: feed the 'prompt' column to the model, score against 'answer'.
needlebench_reader_cfg = {'input_columns': ['prompt'], 'output_column': 'answer'}

# Inference: zero-shot generation over a single HUMAN/BOT round.
needlebench_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {'role': 'HUMAN', 'prompt': '{prompt}'},
                {'role': 'BOT', 'prompt': '{answer}\n'},
            ]
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer},
}

# Evaluation: multi-needle evaluator plus prediction/dataset post-processing.
needlebench_eval_cfg = {
    'evaluator': {'type': NeedleBenchMultiEvaluator},
    'pred_postprocessor': {'type': needlebench_postprocess},
    'dataset_postprocessor': {'type': needlebench_dataset_postprocess},
    'pred_role': 'BOT',
}
context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# ----------English Version----------
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
length_buffer = 3000
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_en_datasets = []
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_en_datasets.append(dataset_dict)
# Create the English multi-needle datasets for every needle count 2..5.
# Fix: `dataset_list_name` was computed but never used — the globals()
# lookup rebuilt the same f-string; use the name once and hoist the list
# lookup out of the inner (length, depth) loops.
for num_needles in range(2, 6):
    dataset_list_name = f'needlebench_{num_needles}needle_en_datasets'
    # Target list for this needle count (resolved by name at module level).
    target_list = globals()[dataset_list_name]
    for original_context_length in context_lengths:
        for depth_percent in depths_list:
            # One dataset config per (context length, needle depth) cell.
            dataset_dict = {
                'abbr': f'Length{original_context_length}'
                f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
                'type': NeedleBenchMultiDataset,
                'path': base_path,
                'length': original_context_length,
                'depth': int(depth_percent),
                'tokenizer_model': 'gpt-4',
                'file_list': file_list,
                'num_repeats_per_file': 10,
                'length_buffer': length_buffer,
                'language': language,
                'needle_file_name': needle_file_name,
                'num_needles': num_needles,
                'diff': diff,
                'reader_cfg': needlebench_reader_cfg,
                'infer_cfg': needlebench_infer_cfg,
                'eval_cfg': needlebench_eval_cfg,
            }
            target_list.append(dataset_dict)
# ----------Chinese Version----------
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
length_buffer = 200
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_zh_datasets = []
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_zh_datasets.append(dataset_dict)
# Create the Chinese multi-needle datasets for every needle count 2..5.
# Fix: `dataset_list_name` was computed but never used — the globals()
# lookup rebuilt the same f-string; use the name once and hoist the list
# lookup out of the inner (length, depth) loops.
for num_needles in range(2, 6):
    dataset_list_name = f'needlebench_{num_needles}needle_zh_datasets'
    # Target list for this needle count (resolved by name at module level).
    target_list = globals()[dataset_list_name]
    for original_context_length in context_lengths:
        for depth_percent in depths_list:
            # One dataset config per (context length, needle depth) cell.
            dataset_dict = {
                'abbr': f'Length{original_context_length}'
                f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
                'type': NeedleBenchMultiDataset,
                'path': base_path,
                'length': original_context_length,
                'depth': int(depth_percent),
                'tokenizer_model': 'gpt-4',
                'file_list': file_list,
                'num_repeats_per_file': 10,
                'length_buffer': length_buffer,
                'language': language,
                'needle_file_name': needle_file_name,
                'num_needles': num_needles,
                'diff': diff,
                'reader_cfg': needlebench_reader_cfg,
                'infer_cfg': needlebench_infer_cfg,
                'eval_cfg': needlebench_eval_cfg,
            }
            target_list.append(dataset_dict)

View File

@ -1,108 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_1000k import depths_list as depths, context_lengths
from .needlebench_single_1000k import needlebench_reader_cfg, needlebench_infer_cfg, needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
    """Logistic (sigmoid) curve at ``x``: maximum ``L``, midpoint ``x0``,
    steepness ``k``; result rounded to 3 decimal places."""
    denominator = 1 + math.exp(-k * (x - x0))
    return round(L / denominator, 3)
def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end`` inclusive.

    Raises ``ValueError`` when ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError('num must be at least 1.')
    if num == 1:
        return [start]
    step = (end - start) / (num - 1)
    values = []
    for idx in range(num):
        values.append(start + step * idx)
    return values
def generate_depth_percents(intervals, interval_type):
    """Return ``intervals`` depth percentages spanning 0-100.

    'linear' spaces them evenly; 'sigmoid' maps the evenly spaced points
    through the logistic curve. Any other type raises ``ValueError``.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        points = generate_linear_space(0, 100, intervals)
        return [logistic(point) for point in points]
    raise ValueError('Unsupported interval type')
# Reader: feed the 'prompt' column to the model, score against 'answer'.
needlebench_reader_cfg = {'input_columns': ['prompt'], 'output_column': 'answer'}

# Inference: zero-shot generation over a single HUMAN/BOT round.
needlebench_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {'role': 'HUMAN', 'prompt': '{prompt}'},
                {'role': 'BOT', 'prompt': '{answer}\n'},
            ]
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer},
}

# Evaluation: parallel (multi-retrieval) evaluator plus post-processing.
needlebench_eval_cfg = {
    'evaluator': {'type': NeedleBenchParallelEvaluator},
    'pred_postprocessor': {'type': needlebench_postprocess},
    'dataset_postprocessor': {'type': needlebench_dataset_postprocess},
    'pred_role': 'BOT',
}
context_lengths = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
needlebench_eval_cfg['evaluator']['type'] = NeedleBenchParallelEvaluator
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_en_1000k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 3000,
'guide': True,
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
needlebench_en_datasets.append(dataset_dict)
]
file_list = ['zh_finance.jsonl']
# Initialize empty dataset lists
needlebench_en_datasets = []
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_zh_1000k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
# Builds one parallel (multi-retrieval) dataset config per context length,
# for each entry in language_configs (English / Chinese).
for config in language_configs:
    for original_context_length in context_lengths:
        dataset_dict = {
            'abbr': f'Length{original_context_length}_parallel_{config["suffix"]}_1000k',
            'type': NeedleBenchParallelDataset,
            'path': base_path,
            'needle_file_name': needle_file_name,
            'length': original_context_length,
            # All needle depths are sampled within a single dataset here.
            'depths': depths,
            'tokenizer_model': 'gpt-4',
            'file_list': config['file_list'],
            'num_repeats_per_file': 25,
            'length_buffer': config['length_buffer'],
            'language': config['language'],
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg,
        }
        # Append to needlebench_en_datasets / needlebench_zh_datasets,
        # selected by name via globals().
        globals()[config['dataset_var']].append(dataset_dict)

View File

@ -5,30 +5,6 @@ from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
    """Logistic (sigmoid) curve at ``x``: maximum ``L``, midpoint ``x0``,
    steepness ``k``; result rounded to 3 decimal places."""
    denominator = 1 + math.exp(-k * (x - x0))
    return round(L / denominator, 3)
def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end`` inclusive.

    Raises ``ValueError`` when ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError('num must be at least 1.')
    if num == 1:
        return [start]
    step = (end - start) / (num - 1)
    values = []
    for idx in range(num):
        values.append(start + step * idx)
    return values
def generate_depth_percents(intervals, interval_type):
    """Return ``intervals`` depth percentages spanning 0-100.

    'linear' spaces them evenly; 'sigmoid' maps the evenly spaced points
    through the logistic curve. Any other type raises ``ValueError``.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        points = generate_linear_space(0, 100, intervals)
        return [logistic(point) for point in points]
    raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
@ -54,58 +30,52 @@ needlebench_eval_cfg = dict(
pred_role='BOT',
)
context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
context_lengths = list([1000, 125000, 250000, 375000, 500000, 625000, 750000, 875000, 1000000])
depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'needles.jsonl'
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
]
# Initialize empty dataset lists
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_1000k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_1000k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
# Builds one single-needle (origin) dataset config per (length, depth) cell,
# for each entry in language_configs (English / Chinese).
for config in language_configs:
    for original_context_length in context_lengths:
        for depth_percent in depths_list:
            dataset_dict = {
                'abbr': f'Length{original_context_length}'
                f'Depth{int(depth_percent)}_origin_{config["suffix"]}_1000k',
                'type': NeedleBenchOriginDataset,
                'path': base_path,
                'length': original_context_length,
                'depth': int(depth_percent),
                'tokenizer_model': 'gpt-4',
                'file_list': config['file_list'],
                'num_repeats_per_file': 10,
                'length_buffer': config['length_buffer'],
                'language': config['language'],
                'needle_file_name': needle_file_name,
                'reader_cfg': needlebench_reader_cfg,
                'infer_cfg': needlebench_infer_cfg,
                'eval_cfg': needlebench_eval_cfg,
            }
            # Append to the per-language dataset list selected by name
            # via globals() (needlebench_en_datasets / needlebench_zh_datasets).
            globals()[config['dataset_var']].append(dataset_dict)

View File

@ -1,18 +1,32 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_128k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_128k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_128k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_128k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_128k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_128k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_128k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_128k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_reasoning_128k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_128k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_128k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_128k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_128k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_single_128k import needlebench_en_datasets as needlebench_origin_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_single_128k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_retrieval_128k import needlebench_en_datasets as needlebench_parallel_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_multi_retrieval_128k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
if __name__ == '__main__':
print(len(needlebench_datasets))
# sum num_repeats_per_file of all datasets
num_repeats_per_file = sum(dataset['num_repeats_per_file'] for dataset in needlebench_datasets) * 8
print(num_repeats_per_file)
# every repeat is 5 seconds
print(num_repeats_per_file * 5 / 60, 'minutes')
# print number of hours
print(num_repeats_per_file * 5 / 3600, 'hours')
# if every repeat is 2 minutes, how many days
print(num_repeats_per_file * 2 / 60 / 24, 'days')

View File

@ -1,290 +1,95 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_128k import depths_list, context_lengths
from .needlebench_single_128k import needlebench_reader_cfg, needlebench_infer_cfg
from opencompass.configs.datasets.needlebench.atc.atc_0shot_nocot_2_power_en import needlebench_atc_eval_cfg as needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchMultiEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
# ----------English Version----------
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
length_buffer = 3000
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_en_datasets = []
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_en_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_en_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_en_datasets'].append(dataset_dict)
# ----------Chinese Version----------
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
length_buffer = 200
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_zh_datasets = []
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_zh_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_zh_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_128k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_zh_datasets'].append(dataset_dict)

View File

@ -1,108 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_128k import depths_list as depths, context_lengths
from .needlebench_single_128k import needlebench_reader_cfg, needlebench_infer_cfg, needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchParallelEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
needlebench_eval_cfg['evaluator']['type'] = NeedleBenchParallelEvaluator
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_en_128k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 3000,
'guide': True,
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
needlebench_en_datasets.append(dataset_dict)
]
file_list = ['zh_finance.jsonl']
# Initialize empty dataset lists
needlebench_en_datasets = []
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_zh_128k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}_parallel_{config["suffix"]}_128k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 25,
'length_buffer': config['length_buffer'],
'guide': True,
'language': config['language'],
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -5,30 +5,6 @@ from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
@ -54,60 +30,54 @@ needlebench_eval_cfg = dict(
pred_role='BOT',
)
context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
context_lengths = list([1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000])
# context_lengths = [128000]
depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'needles.jsonl'
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
]
# Initialize empty dataset lists
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_128k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_128k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_{config["suffix"]}_128k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 10,
'length_buffer': config['length_buffer'],
'guide': True,
'language': config['language'],
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -1,18 +1,18 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_200k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_200k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_200k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_200k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_200k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_200k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_200k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_200k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_reasoning_200k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_200k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_200k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_200k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_200k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_single_200k import needlebench_en_datasets as needlebench_origin_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_single_200k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_retrieval_200k import needlebench_en_datasets as needlebench_parallel_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_200k.needlebench_multi_retrieval_200k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -1,289 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_200k import depths_list, context_lengths
from .needlebench_single_200k import needlebench_reader_cfg, needlebench_infer_cfg
from opencompass.configs.datasets.needlebench.atc.atc_0shot_nocot_2_power_en import needlebench_atc_eval_cfg as needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchMultiEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# ----------English Version----------
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
length_buffer = 3000
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_en_datasets = []
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_en_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_en_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_en_datasets'].append(dataset_dict)
# ----------Chinese Version----------
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
length_buffer = 200
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_zh_datasets = []
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_zh_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_zh_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_200k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_zh_datasets'].append(dataset_dict)

View File

@ -1,109 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_200k import depths_list as depths, context_lengths
from .needlebench_single_200k import needlebench_reader_cfg, needlebench_infer_cfg, needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchParallelEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
needlebench_eval_cfg['evaluator']['type'] = NeedleBenchParallelEvaluator
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_en_200k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 3000,
'guide': True,
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
needlebench_en_datasets.append(dataset_dict)
]
file_list = ['zh_finance.jsonl']
# Initialize empty dataset lists
needlebench_en_datasets = []
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_zh_200k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}_parallel_{config["suffix"]}_200k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 25,
'length_buffer': config['length_buffer'],
'language': config['language'],
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -5,30 +5,6 @@ from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
@ -54,59 +30,52 @@ needlebench_eval_cfg = dict(
pred_role='BOT',
)
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
context_lengths = list([1000, 25000, 50000, 75000, 100000, 125000, 150000, 175000, 200000])
depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'needles.jsonl'
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
]
# Initialize empty dataset lists
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_200k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_200k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_{config["suffix"]}_200k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 10,
'length_buffer': config['length_buffer'],
'language': config['language'],
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -1,18 +1,18 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_256k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_reasoning_256k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_256k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_256k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_256k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_256k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_single_256k import needlebench_en_datasets as needlebench_origin_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_single_256k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_retrieval_256k import needlebench_en_datasets as needlebench_parallel_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_256k.needlebench_multi_retrieval_256k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -1,289 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_256k import depths_list, context_lengths
from .needlebench_single_256k import needlebench_reader_cfg, needlebench_infer_cfg
from opencompass.configs.datasets.needlebench.atc.atc_0shot_nocot_2_power_en import needlebench_atc_eval_cfg as needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchMultiEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# ----------English Version----------
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
length_buffer = 3000
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_en_datasets = []
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_en_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_en_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_en_datasets'].append(dataset_dict)
# ----------Chinese Version----------
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
length_buffer = 200
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_zh_datasets = []
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_zh_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_zh_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_zh_datasets'].append(dataset_dict)

View File

@ -1,109 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_256k import depths_list as depths, context_lengths
from .needlebench_single_256k import needlebench_reader_cfg, needlebench_infer_cfg, needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchParallelEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
needlebench_eval_cfg['evaluator']['type'] = NeedleBenchParallelEvaluator
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_en_256k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 3000,
'guide': True,
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
needlebench_en_datasets.append(dataset_dict)
]
file_list = ['zh_finance.jsonl']
# Initialize empty dataset lists
needlebench_en_datasets = []
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_zh_256k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}_parallel_{config["suffix"]}_256k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 25,
'length_buffer': config['length_buffer'],
'language': config['language'],
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -5,30 +5,6 @@ from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
@ -54,59 +30,52 @@ needlebench_eval_cfg = dict(
pred_role='BOT',
)
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'needles.jsonl'
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
]
# Initialize empty dataset lists
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_256k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_256k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_{config["suffix"]}_256k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 10,
'length_buffer': config['length_buffer'],
'language': config['language'],
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -1,18 +1,18 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_32k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_32k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_32k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_32k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_32k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_32k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_32k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_32k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_reasoning_32k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_32k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_32k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_32k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_32k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_single_32k import needlebench_en_datasets as needlebench_origin_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_single_32k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_retrieval_32k import needlebench_en_datasets as needlebench_parallel_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_multi_retrieval_32k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -1,290 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_32k import depths_list, context_lengths
from .needlebench_single_32k import needlebench_reader_cfg, needlebench_infer_cfg
from opencompass.configs.datasets.needlebench.atc.atc_0shot_nocot_2_power_en import needlebench_atc_eval_cfg as needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
    """Standard logistic (sigmoid) curve, rounded to three decimals.

    Maps *x* onto an S-shaped curve with maximum *L*, midpoint *x0* and
    steepness *k*; used to space needle depths non-uniformly.
    """
    exponent = -k * (x - x0)
    value = L / (1 + math.exp(exponent))
    return round(value, 3)
def generate_linear_space(start, end, num):
    """Return *num* evenly spaced values from *start* to *end* inclusive.

    Raises:
        ValueError: if *num* is smaller than 1.
    """
    # num == 1 is handled first so the lone value is *start* unchanged
    # (and no zero-division occurs when computing the increment).
    if num == 1:
        return [start]
    if num < 1:
        raise ValueError('num must be at least 1.')
    increment = (end - start) / (num - 1)
    points = []
    for index in range(num):
        points.append(start + increment * index)
    return points
def generate_depth_percents(intervals, interval_type):
    """Return *intervals* needle-depth percentages in [0, 100].

    'linear' spaces the depths evenly; 'sigmoid' passes the even spacing
    through the logistic curve so depths cluster around the midpoint.

    Raises:
        ValueError: for any other *interval_type*.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        return [logistic(point) for point in generate_linear_space(0, 100, intervals)]
    raise ValueError('Unsupported interval type')
# Dataset reader: feed the 'prompt' column to the model, grade against 'answer'.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
# Inference: zero-shot, single-turn generation with the raw prompt.
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation: multi-needle evaluator with pre/post-processing of both the
# prediction and the reference before scoring.
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchMultiEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
# ----------English Version----------
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
length_buffer = 3000
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 3000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_en_datasets = []
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 3000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 3000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 3000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_en_datasets.append(dataset_dict)
# Build the English multi-needle datasets for 2-5 needles over the full
# (context length x depth) grid, appending into the module-level lists
# needlebench_{k}needle_en_datasets initialized above.
for num_needles in range(2, 6):
    # Fix: the original computed this name but never used it, rebuilding
    # the same f-string on every append; reuse the precomputed name.
    dataset_list_name = f'needlebench_{num_needles}needle_en_datasets'
    for original_context_length in context_lengths:
        for depth_percent in depths_list:
            dataset_dict = {
                'abbr': f'Length{original_context_length}'
                f'Depth{int(depth_percent)}_{num_needles}needle_en_32k',
                'type': NeedleBenchMultiDataset,
                'path': base_path,
                'length': original_context_length,
                'depth': int(depth_percent),
                'tokenizer_model': 'gpt-4',
                'file_list': file_list,
                'num_repeats_per_file': 10,
                'length_buffer': length_buffer,
                'language': language,
                'needle_file_name': needle_file_name,
                'num_needles': num_needles,
                'diff': diff,
                'reader_cfg': needlebench_reader_cfg,
                'infer_cfg': needlebench_infer_cfg,
                'eval_cfg': needlebench_eval_cfg,
            }
            # Add to the appropriate list using globals()
            globals()[dataset_list_name].append(dataset_dict)
# ----------Chinese Version----------
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
length_buffer = 200
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_zh_datasets = []
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_zh_datasets.append(dataset_dict)
# Build the Chinese multi-needle datasets for 2-5 needles over the full
# (context length x depth) grid, appending into the module-level lists
# needlebench_{k}needle_zh_datasets initialized above.
for num_needles in range(2, 6):
    # Fix: the original computed this name but never used it, rebuilding
    # the same f-string on every append; reuse the precomputed name.
    dataset_list_name = f'needlebench_{num_needles}needle_zh_datasets'
    for original_context_length in context_lengths:
        for depth_percent in depths_list:
            dataset_dict = {
                'abbr': f'Length{original_context_length}'
                f'Depth{int(depth_percent)}_{num_needles}needle_zh_32k',
                'type': NeedleBenchMultiDataset,
                'path': base_path,
                'length': original_context_length,
                'depth': int(depth_percent),
                'tokenizer_model': 'gpt-4',
                'file_list': file_list,
                'num_repeats_per_file': 10,
                'length_buffer': length_buffer,
                'language': language,
                'needle_file_name': needle_file_name,
                'num_needles': num_needles,
                'diff': diff,
                'reader_cfg': needlebench_reader_cfg,
                'infer_cfg': needlebench_infer_cfg,
                'eval_cfg': needlebench_eval_cfg,
            }
            # Add to the appropriate list using globals()
            globals()[dataset_list_name].append(dataset_dict)

View File

@ -1,108 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_32k import depths_list as depths, context_lengths
from .needlebench_single_32k import needlebench_reader_cfg, needlebench_infer_cfg, needlebench_eval_cfg
# Logistic (sigmoid) curve with maximum L, midpoint x0, steepness k;
# result rounded to 3 decimals.
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
# num evenly spaced values from start to end inclusive; ValueError if num < 1.
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
# Depth percentages in [0, 100]: 'linear' is even spacing, 'sigmoid'
# re-maps the even spacing through the logistic curve.
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchParallelEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
needlebench_eval_cfg['evaluator']['type'] = NeedleBenchParallelEvaluator
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_en_32k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 3000,
'guide': True,
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
needlebench_en_datasets.append(dataset_dict)
]
file_list = ['zh_finance.jsonl']
# Initialize empty dataset lists
needlebench_en_datasets = []
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_zh_32k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
# Parallel (multi-retrieval) variant: one dataset per context length, with
# needles placed at every depth in `depths` inside the same context.
for config in language_configs:
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}_parallel_{config["suffix"]}_32k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
# 25 repeats here vs 10 in the single-needle configs.
'num_repeats_per_file': 25,
'length_buffer': config['length_buffer'],
'language': config['language'],
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Append to the per-language module-level list by its name.
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -5,30 +5,6 @@ from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
# Logistic (sigmoid) curve with maximum L, midpoint x0, steepness k;
# result rounded to 3 decimals.
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
# num evenly spaced values from start to end inclusive; ValueError if num < 1.
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
# Depth percentages in [0, 100]: 'linear' is even spacing, 'sigmoid'
# re-maps the even spacing through the logistic curve.
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
@ -54,60 +30,52 @@ needlebench_eval_cfg = dict(
pred_role='BOT',
)
context_lengths = list([9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000])
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
context_lengths = list([1000, 4000, 8000, 12000, 16000, 20000, 24000, 28000, 32000])
depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'needles.jsonl'
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'length_buffer': 3000,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
]
# Initialize empty dataset lists
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_32k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 3000,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_32k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
# Single-needle "origin" variant (32k): one dataset per
# (context length, insertion depth) pair, per language.
for config in language_configs:
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_{config["suffix"]}_32k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 10,
'length_buffer': config['length_buffer'],
'language': config['language'],
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Append to the per-language module-level list by its name.
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -1,18 +1,18 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_4k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_4k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_4k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_4k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_4k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_4k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_4k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_4k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_reasoning_4k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_4k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_4k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_4k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_4k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_single_4k import needlebench_en_datasets as needlebench_origin_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_single_4k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_retrieval_4k import needlebench_en_datasets as needlebench_parallel_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_4k.needlebench_multi_retrieval_4k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -1,305 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_4k import depths_list, context_lengths
from .needlebench_single_4k import needlebench_reader_cfg, needlebench_infer_cfg
from opencompass.configs.datasets.needlebench.atc.atc_0shot_nocot_2_power_en import needlebench_atc_eval_cfg as needlebench_eval_cfg
# Logistic (sigmoid) curve with maximum L, midpoint x0, steepness k;
# result rounded to 3 decimals.
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
# num evenly spaced values from start to end inclusive; ValueError if num < 1.
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError('num must be at least 1.')
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
# Depth percentages in [0, 100]: 'linear' is even spacing, 'sigmoid'
# re-maps the even spacing through the logistic curve.
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
# Reader: each sample exposes a single 'prompt' input column; the reference
# answer is read from the 'answer' column.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
# Inference: zero-shot generation — the raw prompt is the HUMAN turn and the
# model completion fills the BOT turn.
needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
# Evaluation: multi-needle evaluator; both the model prediction and the
# dataset reference are post-processed before scoring.
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchMultiEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT',
)
context_lengths = list(range(1000, 5000, 1000))
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
# ----------English Version----------
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
length_buffer = 500
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_en_datasets = []
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_en_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_en_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_en_datasets'].append(dataset_dict)
# ----------Chinese Version----------
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
length_buffer = 200
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_zh_datasets = []
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_zh_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_zh_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_4k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_zh_datasets'].append(dataset_dict)

View File

@ -1,111 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_4k import depths_list as depths, context_lengths
from .needlebench_single_4k import needlebench_reader_cfg, needlebench_infer_cfg, needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
    """Value of a logistic (sigmoid) curve at ``x``, rounded to 3 decimals.

    ``L`` is the curve maximum, ``x0`` the midpoint and ``k`` the steepness.
    """
    decay = math.exp(-k * (x - x0))
    return round(L / (1 + decay), 3)


def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end`` inclusive."""
    if num == 1:
        return [start]
    if num < 1:
        raise ValueError('num must be at least 1.')
    stride = (end - start) / (num - 1)
    return [start + stride * idx for idx in range(num)]


def generate_depth_percents(intervals, interval_type):
    """Generate ``intervals`` depth percentages spanning [0, 100].

    'linear' spaces them evenly; 'sigmoid' warps the even spacing through the
    logistic curve. Any other ``interval_type`` raises ``ValueError``.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        return [logistic(v) for v in generate_linear_space(0, 100, intervals)]
    raise ValueError('Unsupported interval type')
# Reader: each sample exposes a single 'prompt' input column; the reference
# answer is read from the 'answer' column.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
# Inference: zero-shot generation — the raw prompt is the HUMAN turn and the
# model completion fills the BOT turn.
needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
# Evaluation: parallel (multi-retrieval) evaluator; both the prediction and
# the dataset reference are post-processed before scoring.
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchParallelEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT',
)
context_lengths = list(range(1000, 5000, 1000))
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
needlebench_eval_cfg['evaluator']['type'] = NeedleBenchParallelEvaluator
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths_float = generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
)
depths = [int(depth) for depth in depths_float]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_en_4k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 1000,
'guide': True,
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
'length_buffer': 500,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
needlebench_en_datasets.append(dataset_dict)
]
file_list = ['zh_finance.jsonl']
# Initialize empty dataset lists
needlebench_en_datasets = []
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_zh_4k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}_parallel_{config["suffix"]}_4k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 25,
'length_buffer': config['length_buffer'],
'language': config['language'],
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -5,30 +5,6 @@ from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
    """Value of a logistic (sigmoid) curve at ``x``, rounded to 3 decimals.

    ``L`` is the curve maximum, ``x0`` the midpoint and ``k`` the steepness.
    """
    decay = math.exp(-k * (x - x0))
    return round(L / (1 + decay), 3)


def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end`` inclusive."""
    if num == 1:
        return [start]
    if num < 1:
        raise ValueError('num must be at least 1.')
    stride = (end - start) / (num - 1)
    return [start + stride * idx for idx in range(num)]


def generate_depth_percents(intervals, interval_type):
    """Generate ``intervals`` depth percentages spanning [0, 100].

    'linear' spaces them evenly; 'sigmoid' warps the even spacing through the
    logistic curve. Any other ``interval_type`` raises ``ValueError``.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        return [logistic(v) for v in generate_linear_space(0, 100, intervals)]
    raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
@ -54,63 +30,52 @@ needlebench_eval_cfg = dict(
pred_role='BOT',
)
context_lengths = list(range(1000, 5000, 1000))
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
context_lengths = list([1000, 2000, 3000, 4000])
depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'needles.jsonl'
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'length_buffer': 500,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
]
# Initialize empty dataset lists
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_4k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_4k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_{config["suffix"]}_4k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 10,
'length_buffer': config['length_buffer'],
'language': config['language'],
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -1,18 +1,18 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_8k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_8k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_8k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_8k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_8k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_8k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_8k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_8k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_reasoning_8k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_8k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_8k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_8k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_8k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_single_8k import needlebench_en_datasets as needlebench_origin_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_single_8k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_retrieval_8k import needlebench_en_datasets as needlebench_parallel_en_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_multi_retrieval_8k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -1,305 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_8k import depths_list, context_lengths
from .needlebench_single_8k import needlebench_reader_cfg, needlebench_infer_cfg
from opencompass.configs.datasets.needlebench.atc.atc_0shot_nocot_2_power_en import needlebench_atc_eval_cfg as needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
    """Value of a logistic (sigmoid) curve at ``x``, rounded to 3 decimals.

    ``L`` is the curve maximum, ``x0`` the midpoint and ``k`` the steepness.
    """
    decay = math.exp(-k * (x - x0))
    return round(L / (1 + decay), 3)


def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end`` inclusive."""
    if num == 1:
        return [start]
    if num < 1:
        raise ValueError('num must be at least 1.')
    stride = (end - start) / (num - 1)
    return [start + stride * idx for idx in range(num)]


def generate_depth_percents(intervals, interval_type):
    """Generate ``intervals`` depth percentages spanning [0, 100].

    'linear' spaces them evenly; 'sigmoid' warps the even spacing through the
    logistic curve. Any other ``interval_type`` raises ``ValueError``.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        return [logistic(v) for v in generate_linear_space(0, 100, intervals)]
    raise ValueError('Unsupported interval type')
# Reader: each sample exposes a single 'prompt' input column; the reference
# answer is read from the 'answer' column.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
# Inference: zero-shot generation — the raw prompt is the HUMAN turn and the
# model completion fills the BOT turn.
needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
# Evaluation: multi-needle evaluator; both the model prediction and the
# dataset reference are post-processed before scoring.
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchMultiEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT',
)
context_lengths = list(range(5000, 9000, 1000))
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
# ----------English Version----------
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
length_buffer = 500
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 1000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_en_datasets = []
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 1000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 1000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 1000,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_en_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_en_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_en_datasets'].append(dataset_dict)
# ----------Chinese Version----------
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
needle_file_name = 'names.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
length_buffer = 200
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
# Initialize dataset lists
needlebench_2needle_zh_datasets = []
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_5needle_zh_datasets.append(dataset_dict)
# Create datasets for different numbers of needles
for num_needles in range(2, 6):
dataset_list_name = f'needlebench_{num_needles}needle_zh_datasets'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_8k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': length_buffer,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
# Add to the appropriate list using globals()
globals()[f'needlebench_{num_needles}needle_zh_datasets'].append(dataset_dict)

View File

@ -1,111 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
from mmengine.config import read_base
with read_base():
from .needlebench_single_8k import depths_list as depths, context_lengths
from .needlebench_single_8k import needlebench_reader_cfg, needlebench_infer_cfg, needlebench_eval_cfg
def logistic(x, L=100, x0=50, k=0.1):
    """Logistic (sigmoid) curve with ceiling ``L``, midpoint ``x0`` and
    steepness ``k``, rounded to three decimal places."""
    return round(L / (1 + math.exp(-k * (x - x0))), 3)


def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values covering [start, end] inclusive.

    Raises ValueError when ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError('num must be at least 1.')
    if num == 1:
        return [start]
    step = (end - start) / (num - 1)
    return [start + i * step for i in range(num)]


def generate_depth_percents(intervals, interval_type):
    """Produce document-depth percentages on a 0-100 grid, either linearly
    spaced or warped through the logistic curve."""
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        return [logistic(v) for v in generate_linear_space(0, 100, intervals)]
    raise ValueError('Unsupported interval type')
# Reader: the dataset provides a ready-made 'prompt' column; 'answer' is gold.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

# Inference: zero-shot generation — the dataset prompt is passed through verbatim.
needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Evaluation: parallel-needles evaluator; predictions and gold answers are
# both post-processed, and scores are read from the BOT role.
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchParallelEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT',
)
# Context lengths for the 8k setting: 5k, 6k, 7k, 8k tokens.
context_lengths = list(range(5000, 9000, 1000))
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
# NOTE(review): this re-assigns the same evaluator type already set in
# needlebench_eval_cfg above — looks redundant; confirm before removing.
needlebench_eval_cfg['evaluator']['type'] = NeedleBenchParallelEvaluator
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
# Depth percentages are generated as floats, then truncated to ints.
depths_float = generate_depth_percents(
    document_depth_percent_intervals, document_depth_percent_interval_type
)
depths = [int(depth) for depth in depths_float]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}' f'_parallel_en_8k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 1300,
'guide': True,
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
'length_buffer': 500,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
needlebench_en_datasets.append(dataset_dict)
]
file_list = ['zh_finance.jsonl']
# Initialize empty dataset lists
needlebench_en_datasets = []
needlebench_zh_datasets = []
# One Chinese parallel-needles dataset config per context length (8k setting).
for ctx_len in context_lengths:
    needlebench_zh_datasets.append({
        'abbr': f'Length{ctx_len}_parallel_zh_8k',
        'type': NeedleBenchParallelDataset,
        'path': base_path,
        'needle_file_name': needle_file_name,
        'length': ctx_len,
        'depths': depths,
        'tokenizer_model': 'gpt-4',
        'file_list': file_list,
        'num_repeats_per_file': 25,
        'length_buffer': 200,
        'guide': True,
        'language': 'Chinese',
        'reader_cfg': needlebench_reader_cfg,
        'infer_cfg': needlebench_infer_cfg,
        'eval_cfg': needlebench_eval_cfg,
    })
# Build the English and Chinese parallel datasets from the shared
# language_configs table in a single pass.
for lang_cfg in language_configs:
    # Look up the per-language target list once per config entry.
    target_list = globals()[lang_cfg['dataset_var']]
    suffix = lang_cfg['suffix']
    for ctx_len in context_lengths:
        target_list.append({
            'abbr': f'Length{ctx_len}_parallel_{suffix}_8k',
            'type': NeedleBenchParallelDataset,
            'path': base_path,
            'needle_file_name': needle_file_name,
            'length': ctx_len,
            'depths': depths,
            'tokenizer_model': 'gpt-4',
            'file_list': lang_cfg['file_list'],
            'num_repeats_per_file': 25,
            'length_buffer': lang_cfg['length_buffer'],
            'language': lang_cfg['language'],
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg,
        })

View File

@ -1,122 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
    """Sigmoid of ``x`` with ceiling ``L``, midpoint ``x0`` and steepness
    ``k``; result rounded to 3 decimals."""
    value = L / (1 + math.exp(-k * (x - x0)))
    return round(value, 3)


def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced points from ``start`` to ``end``
    inclusive (ValueError when ``num`` < 1)."""
    if num == 1:
        return [start]
    if num < 1:
        raise ValueError('num must be at least 1.')
    step = (end - start) / (num - 1)
    points = []
    for index in range(num):
        points.append(start + step * index)
    return points


def generate_depth_percents(intervals, interval_type):
    """Depth percentages on a 0-100 grid, linearly or sigmoid-spaced."""
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    elif interval_type == 'sigmoid':
        return [logistic(x) for x in generate_linear_space(0, 100, intervals)]
    else:
        raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchParallelEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT',
)
context_lengths = list(range(5000, 9000, 1000))
document_depth_percent_intervals_list = [1, 5, 10, 15, 20]
document_depth_percent_interval_type = 'linear'
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for document_depth_percent_intervals in document_depth_percent_intervals_list:
depths_float = generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
)
depths = [int(depth) for depth in depths_float]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'_parallel_en_8k_batch{document_depth_percent_intervals}',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 50,
'length_buffer': 1300,
'guide': True,
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for document_depth_percent_intervals in document_depth_percent_intervals_list:
depths_float = generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
)
depths = [int(depth) for depth in depths_float]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'_parallel_zh_8k_batch{document_depth_percent_intervals}',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 50,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)

View File

@ -5,30 +5,6 @@ from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
    """Logistic curve value at ``x`` (ceiling ``L``, midpoint ``x0``,
    steepness ``k``), rounded to three decimals."""
    exponent = -k * (x - x0)
    return round(L / (1 + math.exp(exponent)), 3)


def generate_linear_space(start, end, num):
    """``num`` evenly spaced samples over [start, end], endpoints included."""
    # Guard clauses first; the single-point case short-circuits.
    if num < 1:
        raise ValueError('num must be at least 1.')
    if num == 1:
        return [start]
    step = (end - start) / (num - 1)
    return [start + step * idx for idx in range(num)]


def generate_depth_percents(intervals, interval_type):
    """Document-depth percentages over 0-100, linear or sigmoid-warped."""
    if interval_type not in ('linear', 'sigmoid'):
        raise ValueError('Unsupported interval type')
    linear_space = generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        return [logistic(x) for x in linear_space]
    return linear_space
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
@ -54,63 +30,52 @@ needlebench_eval_cfg = dict(
pred_role='BOT',
)
context_lengths = list(range(5000, 9000, 1000))
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'
context_lengths = list([1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000])
depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'needles.jsonl'
# Define configurations for both English and Chinese datasets
language_configs = [
{
'file_list': ['PaulGrahamEssays.jsonl'],
'dataset_var': 'needlebench_en_datasets',
'language': 'English',
'length_buffer': 500,
'suffix': 'en'
},
{
'file_list': ['zh_finance.jsonl'],
'dataset_var': 'needlebench_zh_datasets',
'language': 'Chinese',
'length_buffer': 200,
'suffix': 'zh'
}
]
# Initialize empty dataset lists
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_8k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 800,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in generate_depth_percents(
document_depth_percent_intervals, document_depth_percent_interval_type
):
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_8k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
needlebench_zh_datasets.append(dataset_dict)
# Single loop to handle both languages
for config in language_configs:
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_{config["suffix"]}_8k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': config['file_list'],
'num_repeats_per_file': 10,
'length_buffer': config['length_buffer'],
'language': config['language'],
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg,
}
globals()[config['dataset_var']].append(dataset_dict)

View File

@ -75,7 +75,6 @@ for original_context_length in context_lengths:
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': False,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
@ -101,7 +100,6 @@ for original_context_length in context_lengths:
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': False,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,

View File

@ -87,9 +87,9 @@ def create_summarizer(context_lengths, depths, dataset_size,
'subsets': [[f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}', 'naive_average'],
[f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}', 'naive_average'],
[f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}', 'average_score']],
'weights': {f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}': 0.4,
f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}': 0.3,
f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}': 0.3}})
'weights': {f'Single-Needle-Retrieval(S-RT)-{dataset_size.upper()}': 1/3,
f'Multi-Needle-Reasoning(M-RS)-{dataset_size.upper()}': 1/3,
f'Multi-Needle-Retrieval(M-RT)-{dataset_size.upper()}': 1/3}})
summarizer_config = {
'type': NeedleBenchSummarizer,
'summary_groups': summary_groups,
@ -120,200 +120,20 @@ def create_summarizer(context_lengths, depths, dataset_size,
return summarizer_config
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
depths_list_20 = [i for i in range(0, 101, 5)] # [0, 5, 10, ..., 100]
depths_list_10 = [i for i in range(0, 101, 10)] # [0, 10, 20, ..., 100]
context_lengths_4k = list(range(1000, 5000, 1000))
needlebench_4k_summarizer = create_summarizer(context_lengths_4k, depths, '4k')
context_lengths_8k = list(range(5000, 9000, 1000))
needlebench_8k_summarizer = create_summarizer(context_lengths_8k, depths, '8k')
context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]
needlebench_32k_summarizer = create_summarizer(context_lengths_32k, depths_list_sparse, '32k')
context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, '128k')
context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, '200k')
context_lengths_256k = list([32000, 128000, 256000])
needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_list_sparse, '256k')
context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, '1000k')
depths_list_internal = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, ]
needlebench_internal_32k_summarizer = create_summarizer([32000], depths_list_internal, '32000')
needlebench_internal_100k_summarizer = create_summarizer([100000], depths_list_internal, '100000')
needlebench_internal_200k_summarizer = create_summarizer([200000], depths_list_internal, '200000')
_needlebench_8k_parallel_en_batch1 = []
_needlebench_8k_parallel_en_batch5 = []
_needlebench_8k_parallel_en_batch10 = []
_needlebench_8k_parallel_en_batch15 = []
_needlebench_8k_parallel_en_batch20 = []
_needlebench_8k_parallel_zh_batch1 = []
_needlebench_8k_parallel_zh_batch5 = []
_needlebench_8k_parallel_zh_batch10 = []
_needlebench_8k_parallel_zh_batch15 = []
_needlebench_8k_parallel_zh_batch20 = []
for original_context_length in context_lengths_8k:
_needlebench_8k_parallel_en_batch1.append(f'Length{original_context_length}_parallel_en_8k_batch1')
_needlebench_8k_parallel_en_batch5.append(f'Length{original_context_length}_parallel_en_8k_batch5')
_needlebench_8k_parallel_en_batch10.append(f'Length{original_context_length}_parallel_en_8k_batch10')
_needlebench_8k_parallel_en_batch15.append(f'Length{original_context_length}_parallel_en_8k_batch15')
_needlebench_8k_parallel_en_batch20.append(f'Length{original_context_length}_parallel_en_8k_batch20')
_needlebench_8k_parallel_zh_batch1.append(f'Length{original_context_length}_parallel_zh_8k_batch1')
_needlebench_8k_parallel_zh_batch5.append(f'Length{original_context_length}_parallel_zh_8k_batch5')
_needlebench_8k_parallel_zh_batch10.append(f'Length{original_context_length}_parallel_zh_8k_batch10')
_needlebench_8k_parallel_zh_batch15.append(f'Length{original_context_length}_parallel_zh_8k_batch15')
_needlebench_8k_parallel_zh_batch20.append(f'Length{original_context_length}_parallel_zh_8k_batch20')
_needlebench_8k_parallel_batch1 = _needlebench_8k_parallel_en_batch1 + _needlebench_8k_parallel_zh_batch1
_needlebench_8k_parallel_batch5 = _needlebench_8k_parallel_en_batch5 + _needlebench_8k_parallel_zh_batch5
_needlebench_8k_parallel_batch10 = _needlebench_8k_parallel_en_batch10 + _needlebench_8k_parallel_zh_batch10
_needlebench_8k_parallel_batch15 = _needlebench_8k_parallel_en_batch15 + _needlebench_8k_parallel_zh_batch15
_needlebench_8k_parallel_batch20 = _needlebench_8k_parallel_en_batch20 + _needlebench_8k_parallel_zh_batch20
needlebench_summary_groups = [
{'name': 'parallel_version_batch1', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch1]},
{'name': 'parallel_version_zh_batch1', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch1]},
{'name': 'parallel_version_en_batch1', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch1]},
{'name': 'parallel_version_batch5', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch5]},
{'name': 'parallel_version_zh_batch5', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch5]},
{'name': 'parallel_version_en_batch5', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch5]},
{'name': 'parallel_version_batch10', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch10]},
{'name': 'parallel_version_zh_batch10', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch10]},
{'name': 'parallel_version_en_batch10', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch10]},
{'name': 'parallel_version_batch15', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch15]},
{'name': 'parallel_version_zh_batch15', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch15]},
{'name': 'parallel_version_en_batch15', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch15]},
{'name': 'parallel_version_batch20', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch20]},
{'name': 'parallel_version_zh_batch20', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch20]},
{'name': 'parallel_version_en_batch20', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch20]},
]
needlebench_8k_batch_overall_summarizer = dict(
dataset_abbrs=[
'--------- NeedleBench-8k Parallel-Needles ---------', # category
'parallel_version_batch1',
'parallel_version_batch5',
'parallel_version_batch10',
'parallel_version_batch15',
'parallel_version_batch20',
'parallel_version_zh_batch1',
'parallel_version_en_batch1',
'parallel_version_zh_batch5',
'parallel_version_en_batch5',
'parallel_version_zh_batch10',
'parallel_version_en_batch10',
'parallel_version_zh_batch15',
'parallel_version_en_batch15',
'parallel_version_zh_batch20',
'parallel_version_en_batch20',
],
summary_groups=needlebench_summary_groups,
)
needlebench_summary_groups = [
{'name': 'parallel_version_batch1', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch1]},
{'name': 'parallel_version_zh_batch1', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch1]},
{'name': 'parallel_version_en_batch1', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch1]},
{'name': 'parallel_version_batch5', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch5]},
{'name': 'parallel_version_zh_batch5', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch5]},
{'name': 'parallel_version_en_batch5', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch5]},
{'name': 'parallel_version_batch10', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch10]},
{'name': 'parallel_version_zh_batch10', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch10]},
{'name': 'parallel_version_en_batch10', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch10]},
{'name': 'parallel_version_batch15', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch15]},
{'name': 'parallel_version_zh_batch15', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch15]},
{'name': 'parallel_version_en_batch15', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch15]},
{'name': 'parallel_version_batch20', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch20]},
{'name': 'parallel_version_zh_batch20', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch20]},
{'name': 'parallel_version_en_batch20', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch20]},
]
needlebench_8k_batch_depth0_summarizer = dict(
dataset_abbrs=[
'--------- NeedleBench-8k Parallel-Needles ---------', # category
'parallel_version_batch1',
'parallel_version_batch5',
'parallel_version_batch10',
'parallel_version_batch15',
'parallel_version_batch20',
'parallel_version_zh_batch1',
'parallel_version_en_batch1',
'parallel_version_zh_batch5',
'parallel_version_en_batch5',
'parallel_version_zh_batch10',
'parallel_version_en_batch10',
'parallel_version_zh_batch15',
'parallel_version_en_batch15',
'parallel_version_zh_batch20',
'parallel_version_en_batch20',
],
summary_groups=needlebench_summary_groups,
)
def gen_atc_summarizer(needle_num_list):
    """Build the summarizer config for the ATC benchmark over the given
    needle counts.

    Creates one weighted summary group per category (the needle count is
    used as the weight, so harder subsets count more), an 'ATC-CE-Overall'
    group averaging the four categories, and a dataset_abbrs table listing
    both accuracy (acc_1) and CircularEval (perf_4) rows.
    """
    categories = [
        'ZH-Direct-CE', 'EN-Direct-CE',
        'ZH-Reasoning-CE', 'EN-Reasoning-CE'
    ]

    summary_groups = []
    for category in categories:
        # CircularEval categories are scored with perf_4, plain ones with acc_1.
        metric = 'perf_4' if 'CE' in category else 'acc_1'
        # Dataset abbreviations do not carry the '-CE'/'-Direct' markers.
        cleaned = category.replace('-CE', '').replace('-Direct', '')
        summary_groups.append({
            'name': category,
            'subsets': [
                [f'NeedleBenchATCDataset-{n}Needle-{cleaned}', metric]
                for n in needle_num_list
            ],
            'weights': {
                f'NeedleBenchATCDataset-{n}Needle-{cleaned}': n
                for n in needle_num_list
            },
        })
    summary_groups.append({
        'name': 'ATC-CE-Overall',
        'subsets': [[category, 'weighted_average'] for category in categories],
    })

    def _metric_rows(metric):
        # One row per (suffix, needle count), separated per suffix group.
        rows = []
        for suffix in ('ZH', 'EN', 'ZH-Reasoning', 'EN-Reasoning'):
            rows.extend(
                [f'NeedleBenchATCDataset-{n}Needle-{suffix}', metric]
                for n in needle_num_list
            )
            rows.append('------------------------------------------')
        return rows

    dataset_abbrs = [['ATC-CE-Overall', 'naive_average']]
    dataset_abbrs += [[category, 'weighted_average'] for category in categories]

    return dict(
        dataset_abbrs=[
            *dataset_abbrs,
            '######## Needlebench-ATC Accuracy ########',  # category
            *_metric_rows('acc_1'),
            '######## Needlebench-ATC CircularEval ########',  # category
            *_metric_rows('perf_4'),
        ],
        summary_groups=summary_groups,
    )
# ATC summarizers covering progressively larger needle-count ranges.
atc_summarizer_20 = gen_atc_summarizer(list(range(2, 20, 1)))
atc_summarizer_50 = gen_atc_summarizer(list(range(2, 50, 1)))
atc_summarizer_80 = gen_atc_summarizer(list(range(2, 80, 1)))

# Per-context-window NeedleBench summarizers; all use the 10%-step depth grid.
context_lengths_4k = [1000, 2000, 3000, 4000]
needlebench_4k_summarizer = create_summarizer(context_lengths_4k, depths_list_10, '4k')
context_lengths_8k = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
needlebench_8k_summarizer = create_summarizer(context_lengths_8k, depths_list_10, '8k')
context_lengths_32k = [1000, 4000, 8000, 12000, 16000, 20000, 24000, 28000, 32000]
needlebench_32k_summarizer = create_summarizer(context_lengths_32k, depths_list_10, '32k')
context_lengths_128k = [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000]
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_10, '128k')
context_lengths_200k = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_10, '200k')
context_lengths_256k = [32000, 128000, 256000]
needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_list_10, '256k')
context_lengths_1000k = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_10, '1000k')

View File

@ -2,14 +2,253 @@
import json
import os
import random
import re
from enum import Enum
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.registry import LOAD_DATASET
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.datasets.needlebench.atc_elder_only import clean_atc_answer, needlebench_atc_postprocess_v2, NeedleBenchATCEvaluator
from opencompass.utils import get_data_path
# Enum of question types used by the ATC (Ancestral Trace Challenge) data.
class QuestionType(Enum):
    ELDEST_ANCESTOR = 0  # eldest traceable ancestor
    NTH_ANCESTOR = 1  # N-th generation ancestor
    NTH_DESCENDANT = 2  # N-th generation descendant
    RELATIONSHIP_DISTANCE = 3  # generational distance between two people
# Generations spanned by each relationship term (1 = parent, 2 = grandparent).
relationship_generation_map_zh = {
    '父亲': 1,
    '母亲': 1,
    '爸爸': 1,
    '妈妈': 1,
    '爷爷': 2,
    '奶奶': 2,
    '姥姥': 2,
    '姥爷': 2,
    '外公': 2,
    '外婆': 2,
}

# English counterpart of the generation map above.
relationship_generation_map_en = {
    'father': 1,
    'mother': 1,
    'dad': 1,
    'mom': 1,
    'grandfather': 2,
    'grandmother': 2,
    'maternal grandmother': 2,
    'maternal grandfather': 2,
    'paternal grandfather': 2,
    'paternal grandmother': 2,
}
relationship_templates_zh_CN = [
'{A}{B}{relationship}',
'{B}{relationship}{A}',
'{A}作为{B}{relationship},对{B}的成长有重要影响。',
'{A}不仅是{B}{relationship},还是{B}的榜样。',
'{A}{B}的成长过程中,不仅仅是{B}{relationship},还是{B}的监护人。',
'{A}{B}来说,不只是一个{relationship},还是一个朋友。',
]
relationship_terms_zh_CN = [
'父亲',
'母亲',
'爸爸',
'妈妈',
'爷爷',
'奶奶',
'姥姥',
'姥爷',
'外公',
'外婆',
]
relationship_terms_en = [
'father',
'mother',
'dad',
'mom',
'grandfather',
'grandmother',
'maternal grandmother',
'maternal grandfather',
'paternal grandfather',
'paternal grandmother',
]
relationship_templates_en = [
"{A} is {B}'s {relationship}.",
"{B}'s {relationship} is {A}.",
("{A}, as {B}'s {relationship}, "
"has a significant impact on {B}'s upbringing."),
("{A} is not only {B}'s {relationship} "
"but also {B}'s role model."),
("During {B}'s upbringing, {A} was not only {B}'s {relationship}, "
"but also {B}'s guardian."),
('For {B}, {A} is not just a {relationship}, '
'but also a friend.'),
"For {B}, {A} is more than just a {relationship}; {A} is a lifelong mentor of {B}.",
]
# Eldest ancestor problem template
shuffled_story_with_prompt_zh_CN = """下面是对你的多步推理能力的测试,这个测试叫做祖先追溯测试,我们会模拟不同人的家庭亲属关系,你的任务是在其中不断推理,直到找到最年长的祖先。
例如
例子1.如果张强的父亲是马克除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克
例子2.如果李明的姥姥是张红而张红的父亲是张强除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强
例子3.如果小明是张红的曾孙女张红的祖母是王华王华的父亲是王刚除此以外提供的文本中没有更多关于亲属关系的信息那么小明能够向上追溯到的最年长的亲人就是王刚
注意
1. 你不必纠结这个测试中的人名的性别关系例如一个通常被视为女性化的名字仍然可以是其他人的父亲我们的重点是谁更年长
2. 忽略这个测试中的姓氏遗传问题例如李明仍然可能是王鹏的亲生父亲我们只关注谁更年长不必纠结孩子是否应该继承父亲或母亲的性别
3. 在回答的最后将你的答案放在\\boxed{{}}例如"所以{last_person}能向上追溯到的最年长的亲人就是\\boxed{{某人(你的答案)}}"
现在打乱的家族关系文本如下
{shuffled_story}
在上面提供的打乱的家族关系文本中'{last_person}'的能够向上追溯到的最年长的亲人是谁
"""
shuffled_story_with_prompt_en = """Here is a test for multi-step reasoning ability called the Ancestral Trace Challenge. In this test, we will simulate different people's familial relationships, and your task is to continuously reason through them until you identify the eldest ancestor.
For example:
Example 1: If James Hill's father is Jasmine Lane, and no further information about familial relationships is provided in the text, then the oldest relative James Hill can trace back to in the provided text is \\boxed{{Jasmine Lane}}.
Example 2: If Andrew Williams's grandmother is Dan Newton, and Dan Newton's father is James Hill, and no further information about familial relationships is provided in the text, then the oldest relative Andrew Williams can trace back to in the provided text is \\boxed{{James Hill}}.
Example 3: If Jeff White's father is Kevin Le, Dan Newton's grandmother is Jeff White, and Jeff White's father is Kevin Le, and Shelley Mills is Dan Newton's great-granddaughter, and no further information about familial relationships is provided in the text, then the oldest relative Shelley Mills can trace back to in the provided text is \\boxed{{Kevin Le}}.
Notes:
1. You do not need to worry about the gender consistency of names in this test. For example, a name that is typically considered feminine can still be the father of another person. Our primary focus is on who is older.
2. Ignore surname inheritance issues. For instance, Andrew Williams could still be the biological father of Christopher Baker. We only care about who is older and do not need to consider whether a child should inherit the father's or mother's surname.
3. At the end of your response, remember to put your final answer within \\boxed{{}}. For example: "So the oldest relative '{last_person}' can trace back to in the provided text is \\boxed{{somebody (your answer here)}}."
Now, the scrambled family relationships are provided below:
{shuffled_story}
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
"""
# Nth ancestor problem template
nth_ancestor_prompt_zh_CN = """下面是对你的多步推理能力的测试,这个测试叫做祖先追溯测试,我们会模拟不同人的家庭亲属关系,你的任务是在其中不断推理,找到指定人物的特定代祖先。
例如
例子1.如果张强的父亲是马克我们说马克是张强的1代祖先
例子2.如果李明的姥姥是张红姥姥算两代关系而张红的父亲是张强那么张红是李明的2代祖先张强是李明的3代祖先
例子3.如果小明的奶奶是王华奶奶算两代关系王华的妈妈是刘芳那么王华是小明的2代祖先刘芳是小明的3代祖先
注意
1. 你不必纠结这个测试中的人名的性别关系我们只关注辈分关系
2. 忽略这个测试中的姓氏遗传问题我们只关注亲属关系
3. 父亲/母亲/爸爸/妈妈算1代关系爷爷/奶奶/姥姥/姥爷/外公/外婆算2代关系
4. 在回答的最后将你的答案放在\\boxed{{}}例如"所以{person}{n}代祖先就是\\boxed{{某人(你的答案)}}"
现在打乱的家族关系文本如下
{shuffled_story}
在上面提供的打乱的家族关系文本中'{person}'{n}代祖先是谁
"""
nth_ancestor_prompt_en = """Here is a test for multi-step reasoning ability called the Ancestral Trace Challenge. In this test, we will simulate different people's familial relationships, and your task is to identify a specific ancestor of a given person.
For example:
Example 1: If James Hill's father is Jasmine Lane, then Jasmine Lane is James Hill's 1st generation ancestor.
Example 2: If Andrew Williams's grandmother is Dan Newton (grandmother counts as 2 generations), and Dan Newton's father is James Hill, then Dan Newton is Andrew Williams's 2nd generation ancestor, and James Hill is Andrew Williams's 3rd generation ancestor.
Example 3: If Shelley Mills's grandfather is Jeff White (grandfather counts as 2 generations), and Jeff White's mother is Mary Johnson, then Jeff White is Shelley Mills's 2nd generation ancestor, and Mary Johnson is Shelley Mills's 3rd generation ancestor.
Notes:
1. You do not need to worry about the gender consistency of names in this test. We only care about generational relationships.
2. Ignore surname inheritance issues. We only care about familial relationships.
3. Father/mother/dad/mom count as 1 generation, while grandfather/grandmother/maternal grandmother/maternal grandfather/paternal grandfather/paternal grandmother count as 2 generations.
4. At the end of your response, remember to put your final answer within \\boxed{{}}. For example: "So the {n}th generation ancestor of '{person}' is \\boxed{{somebody (your answer here)}}."
Now, the scrambled family relationships are provided below:
{shuffled_story}
Given the scrambled family relationships described above, who is the {n}th generation ancestor of '{person}'?
"""
# Nth descendant problem template
nth_descendant_prompt_zh_CN = """下面是对你的多步推理能力的测试,这个测试叫做家族关系追溯测试,我们会模拟不同人的家庭亲属关系,你的任务是在其中不断推理,找到指定人物的特定代子孙。
例如
例子1.如果马克是张强的父亲我们说张强是马克的1代子孙
例子2.如果张红是李明的姥姥姥姥算两代关系而张强是张红的父亲那么李明是张红的2代子孙李明是张强的3代子孙
例子3.如果王华是小明的爷爷爷爷算两代关系刘芳是王华的妈妈那么小明是王华的2代子孙小明是刘芳的3代子孙
注意
1. 你不必纠结这个测试中的人名的性别关系我们只关注辈分关系
2. 忽略这个测试中的姓氏遗传问题我们只关注亲属关系
3. 父亲/母亲/爸爸/妈妈算1代关系爷爷/奶奶/姥姥/姥爷/外公/外婆算2代关系
4. 在回答的最后将你的答案放在\\boxed{{}}例如"所以{person}{n}代子孙就是\\boxed{{某人(你的答案)}}"
现在打乱的家族关系文本如下
{shuffled_story}
在上面提供的打乱的家族关系文本中'{person}'{n}代子孙是谁
"""
nth_descendant_prompt_en = """Here is a test for multi-step reasoning ability called the Ancestral Trace Challenge. In this test, we will simulate different people's familial relationships, and your task is to identify a specific descendant of a given person.
For example:
Example 1: If Jasmine Lane is James Hill's father, then James Hill is Jasmine Lane's 1st generation descendant.
Example 2: If Dan Newton is Andrew Williams's grandmother (grandmother counts as 2 generations), and James Hill is Dan Newton's father, then Andrew Williams is Dan Newton's 2nd generation descendant, and Andrew Williams is James Hill's 3rd generation descendant.
Example 3: If Jeff White is Shelley Mills's grandfather (grandfather counts as 2 generations), and Mary Johnson is Jeff White's mother, then Shelley Mills is Jeff White's 2nd generation descendant, and Shelley Mills is Mary Johnson's 3rd generation descendant.
Notes:
1. You do not need to worry about the gender consistency of names in this test. We only care about generational relationships.
2. Ignore surname inheritance issues. We only care about familial relationships.
3. Father/mother/dad/mom count as 1 generation, while grandfather/grandmother/maternal grandmother/maternal grandfather/paternal grandfather/paternal grandmother count as 2 generations.
4. At the end of your response, remember to put your final answer within \\boxed{{}}. For example: "So the {n}th generation descendant of '{person}' is \\boxed{{somebody (your answer here)}}."
Now, the scrambled family relationships are provided below:
{shuffled_story}
Given the scrambled family relationships described above, who is the {n}th generation descendant of '{person}'?
"""
# Relationship distance problem template
relationship_distance_prompt_zh_CN = """下面是对你的多步推理能力的测试,这个测试叫做家族关系追溯测试,我们会模拟不同人的家庭亲属关系,你的任务是在其中不断推理,计算两个人之间的关系距离。
关系距离定义为家族图中从一个人到另一个人所需的最少代数差距注意不同关系有不同的代数差距例如
例子1.如果马克是张强的父亲父亲算1代关系那么张强和马克之间的关系距离是1
例子2.如果张红是李明的姥姥姥姥算2代关系而张强是张红的父亲父亲算1代关系那么李明和张红之间的关系距离是2李明和张强之间的关系距离是3
例子3.如果小明的爷爷是王华爷爷算2代关系王华的妈妈是刘芳妈妈算1代关系那么小明和王华之间的关系距离是2小明和刘芳之间的关系距离是3
注意
1. 你不必纠结这个测试中的人名的性别关系我们只关注辈分关系
2. 忽略这个测试中的姓氏遗传问题我们只关注亲属关系
3. 父亲/母亲/爸爸/妈妈算1代关系爷爷/奶奶/姥姥/姥爷/外公/外婆算2代关系
4. 在回答的最后将你的答案放在\\boxed{{}}例如"所以{person_a}{person_b}之间的关系距离是\\boxed{{5}}"
现在打乱的家族关系文本如下
{shuffled_story}
在上面提供的打乱的家族关系文本中'{person_a}''{person_b}'之间的关系距离是多少
"""
relationship_distance_prompt_en = """Here is a test for multi-step reasoning ability called the Ancestral Trace Challenge. In this test, we will simulate different people's familial relationships, and your task is to calculate the relationship distance between two individuals.
The relationship distance is defined as the minimum number of generational gaps needed to go from one person to another in the family graph. Note that different relationships have different generational gaps. For example:
Example 1: If Jasmine Lane is James Hill's father (father counts as 1 generation), then the relationship distance between James Hill and Jasmine Lane is 1.
Example 2: If Dan Newton is Andrew Williams's grandmother (grandmother counts as 2 generations), and James Hill is Dan Newton's father (father counts as 1 generation), then the relationship distance between Andrew Williams and Dan Newton is 2, and the relationship distance between Andrew Williams and James Hill is 3.
Example 3: If Jeff White is Shelley Mills's grandfather (grandfather counts as 2 generations), and Mary Johnson is Jeff White's mother (mother counts as 1 generation), then the relationship distance between Shelley Mills and Jeff White is 2, and the relationship distance between Shelley Mills and Mary Johnson is 3.
Notes:
1. You do not need to worry about the gender consistency of names in this test. We only care about relationship connections.
2. Ignore surname inheritance issues. We only care about familial relationships.
3. Father/mother/dad/mom count as 1 generation, while grandfather/grandmother/maternal grandmother/maternal grandfather/paternal grandfather/paternal grandmother count as 2 generations.
4. At the end of your response, remember to put your final answer within \\boxed{{}}. For example: "So the relationship distance between '{person_a}' and '{person_b}' is \\boxed{{5}}."
Now, the scrambled family relationships are provided below:
{shuffled_story}
Given the scrambled family relationships described above, what is the relationship distance between '{person_a}' and '{person_b}'?
"""
@LOAD_DATASET.register_module()
class NeedleBenchATCDataset(BaseDataset):
@ -20,8 +259,15 @@ class NeedleBenchATCDataset(BaseDataset):
num_needles: int,
language: str,
repeats: int,
# This parameter cannot be passed through mmengine because it is blocked as lazy
question_types: list[QuestionType] = [
QuestionType.ELDEST_ANCESTOR,
QuestionType.NTH_ANCESTOR,
QuestionType.NTH_DESCENDANT,
QuestionType.RELATIONSHIP_DISTANCE,
], # Support specifying a list of question types
):
data = {'prompt': [], 'answer': []}
data = {'prompt': [], 'answer': [], 'question_type': []}
path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'HF':
from huggingface_hub import snapshot_download
@ -33,259 +279,136 @@ class NeedleBenchATCDataset(BaseDataset):
names_data = json.load(file)
all_names = names_data[language].split(',')
# Ensure question_types is not empty
if not question_types:
raise ValueError('question_types cannot be empty')
for question_type in question_types:
# Generate the specified number of examples for each question type
for i in range(repeats):
# Set a different seed for each question type and repeat
# Use the enum value of the question type multiplied by 10000 as the base to ensure non-overlapping seed ranges
seed = (i + 1) + (10000 * question_type.value)
random.seed(seed)
# Randomly select the specified number of names from all names
# The number of names is num_needles + 1
names = random.sample(all_names, num_needles+1)
# Select the corresponding relationship terms and templates according to the language
if language == 'Chinese':
relationship_terms = relationship_terms_zh_CN
relationship_templates = relationship_templates_zh_CN
relationship_map = relationship_generation_map_zh
elif language == 'English':
relationship_terms = relationship_terms_en
relationship_templates = relationship_templates_en
relationship_map = relationship_generation_map_en
else:
raise ValueError('Unsupported language specified. '
'Please choose either "Chinese" or "English".')
for _ in range(repeats):
names = random.sample(all_names, num_needles)
if language == 'Chinese':
def generate_chain_family_story(names, templates, relationship_terms, relationship_map):
story = ''
relationships = []
total_generations = 0 # Track the total generational difference
relationship_terms = [
'父亲',
'母亲',
'爸爸',
'妈妈',
'爷爷',
'奶奶',
'姥姥',
'姥爷',
'外公',
'外婆',
]
for i in range(len(names) - 1):
template = random.choice(templates)
relation_term = random.choice(relationship_terms)
relation = template.format(A=names[i],
B=names[i + 1],
relationship=relation_term)
story += f'{relation}*'
relationship_templates = [
'{A}{B}{relationship}',
'{B}{relationship}{A}',
'{A}作为{B}{relationship},对{B}的成长有重要影响。',
'{A}不仅是{B}{relationship},还是{B}的榜样。',
'{B}{A}所生的孩子。',
'{A}{B}来说,不只是一个{relationship},还是一个朋友。',
'{A}{B}的生命中扮演着{relationship}的角色。',
'{B}{A}视为其{relationship}',
]
elif language == 'English':
# Get the generation difference for this relationship
gen_diff = relationship_map.get(relation_term, 1) # Default to 1 generation
total_generations += gen_diff
relationship_terms = [
'father',
'mother',
'dad',
'mom',
'grandfather',
'grandmother',
'maternal grandmother',
'maternal grandfather',
'paternal grandfather',
'paternal grandmother',
]
# Record relationship information for later use
relationships.append((names[i], names[i + 1], relation_term, gen_diff))
relationship_templates = [
"{A} is {B}'s {relationship}.",
"{B}'s {relationship} is {A}.",
("{A}, as {B}'s {relationship}, "
"has a significant impact on {B}'s upbringing."),
("{A} is not only {B}'s {relationship} "
"but also {B}'s role model."),
'{B} is the child of {A}.',
('For {B}, {A} is not just a {relationship}, '
'but also a friend.'),
("{A} plays the role of {B}'s {relationship} "
"in {B}'s life."),
'{B} considers {A} as their {relationship}.',
]
return story, relationships, total_generations
def generate_chain_family_story(names, templates,
relationship_terms):
story = ''
for i in range(len(names) - 1):
template = random.choice(templates)
relation_term = random.choice(relationship_terms)
relation = template.format(A=names[i],
B=names[i + 1],
relationship=relation_term)
story += f'{relation}*'
return story
chain_story, relationships, total_generations = generate_chain_family_story(
names, relationship_templates, relationship_terms, relationship_map)
chain_story = generate_chain_family_story(names,
relationship_templates,
relationship_terms)
# Split the chain_story into a list of fragments
family_story_fragments = chain_story.split('*')
family_story_fragments = [f for f in family_story_fragments if f]
# Splitting the chain_story into a list of fragments
family_story_fragments = chain_story.split('*')
# Shuffle the list of fragments
random.shuffle(family_story_fragments)
# Shuffling the list of fragments
random.shuffle(family_story_fragments)
# Join the shuffled fragments back into a string
shuffled_story = ''.join(family_story_fragments)
# Joining the shuffled fragments back into a string
shuffled_story = ''.join(family_story_fragments)
if question_type == QuestionType.ELDEST_ANCESTOR:
# Eldest ancestor question
last_person = names[-1]
if language == 'Chinese':
prompt = shuffled_story_with_prompt_zh_CN.format(
shuffled_story=shuffled_story, last_person=last_person)
else:
prompt = shuffled_story_with_prompt_en.format(
shuffled_story=shuffled_story, last_person=last_person)
answer = names[0] # The first person is the eldest ancestor
last_person = names[-1]
elif question_type == QuestionType.NTH_ANCESTOR:
# Nth ancestor question - trace from the youngest person to the oldest
person = names[-1] # The youngest person (end of the chain)
n = total_generations # Use the calculated total generational difference
if language == 'Chinese':
prompt = nth_ancestor_prompt_zh_CN.format(
shuffled_story=shuffled_story, person=person, n=n)
else:
prompt = nth_ancestor_prompt_en.format(
shuffled_story=shuffled_story, person=person, n=n)
answer = names[0] # The oldest person (start of the chain) is the nth ancestor
# Generating the prompt based on the language
if language == 'Chinese':
prompt = f"""
在上面提供的打乱的家族关系文本中'{last_person}'的能够向上追溯到的最年长的亲人是谁
例如
例子1.如果张强的父亲是马克除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克
例子2.如果李明的姥姥是张红而张红的父亲是张强除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强
例子3.如果小明是张红的曾孙女张红的祖母是王华王华的父亲是王刚除此以外提供的文本中没有更多关于亲属关系的信息那么小明能够向上追溯到的最年长的亲人就是王刚
"""
elif language == 'English':
prompt = f"""
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
For example:
Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark.
Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang.
Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang."""
else:
prompt = 'Language not supported.'
raise Exception('Unsupported language specified. '
"Please choose either 'Chinese' or 'English'.")
elif question_type == QuestionType.NTH_DESCENDANT:
# Nth descendant question - trace from the oldest person to the youngest
person = names[0] # The oldest person (start of the chain)
n = total_generations # Use the calculated total generational difference
if language == 'Chinese':
prompt = nth_descendant_prompt_zh_CN.format(
shuffled_story=shuffled_story, person=person, n=n)
else:
prompt = nth_descendant_prompt_en.format(
shuffled_story=shuffled_story, person=person, n=n)
answer = names[-1] # The youngest person (end of the chain) is the nth descendant
# Combine story and prompt
shuffled_story_with_prompt = shuffled_story + ' ' + prompt
elif question_type == QuestionType.RELATIONSHIP_DISTANCE:
# Relationship distance question - calculate the relationship distance between the two ends of the chain
person_a = names[0] # The oldest person
person_b = names[-1] # The youngest person
if language == 'Chinese':
prompt = relationship_distance_prompt_zh_CN.format(
shuffled_story=shuffled_story, person_a=person_a, person_b=person_b)
else:
prompt = relationship_distance_prompt_en.format(
shuffled_story=shuffled_story, person_a=person_a, person_b=person_b)
# Use the calculated total generations as the relationship distance
answer = str(total_generations)
data['prompt'].append(shuffled_story_with_prompt)
data['answer'].append(names[0] + '*' + names[0])
else:
# Default fallback to eldest ancestor question
last_person = names[-1]
if language == 'Chinese':
prompt = shuffled_story_with_prompt_zh_CN.format(
shuffled_story=shuffled_story, last_person=last_person)
else:
prompt = shuffled_story_with_prompt_en.format(
shuffled_story=shuffled_story, last_person=last_person)
answer = names[0] # The first person is the eldest ancestor
data['prompt'].append(prompt)
data['answer'].append(answer)
data['question_type'].append(question_type.name)
dataset = Dataset.from_dict({
'prompt': data['prompt'],
'answer': data['answer'],
'question_type': data['question_type'],
})
return dataset
@LOAD_DATASET.register_module()
class NeedleBenchATCOrderedDataset(BaseDataset):
    """Ordered variant of the Ancestral Trace Challenge dataset.

    The relation sentences are generated as a chain and kept in chain
    order: the fragments are split on '*' and re-joined WITHOUT shuffling,
    so the reasoning chain appears in reading order in the prompt.
    """

    @staticmethod
    def load(
        path,
        file_name,
        num_needles: int,
        language: str,
        repeats: int,
    ):
        """Generate `repeats` eldest-ancestor questions.

        Args:
            path: Dataset root, resolved via ``get_data_path`` (downloaded
                from the HF hub when ``DATASET_SOURCE=HF``).
            file_name: JSON file mapping language -> comma-separated names.
            num_needles: Number of people in each relationship chain.
            language: 'Chinese' or 'English'.
            repeats: Number of generated samples.

        Returns:
            ``datasets.Dataset`` with 'prompt' and 'answer' columns.
        """
        data = {'prompt': [], 'answer': []}
        path = get_data_path(path)
        if os.environ.get('DATASET_SOURCE') == 'HF':
            from huggingface_hub import snapshot_download
            path = snapshot_download(repo_id=path, repo_type='dataset')
        file_path = os.path.join(path, file_name)

        with open(file_path, 'r', encoding='utf-8') as file:
            names_data = json.load(file)

        # The JSON stores one comma-separated string of names per language.
        all_names = names_data[language].split(',')

        for _ in range(repeats):
            # NOTE(review): no per-sample seeding here, so generated
            # samples differ between runs.
            names = random.sample(all_names, num_needles)
            if language == 'Chinese':
                # Kinship vocabulary; gender/generation mixing is
                # intentional per the prompt's notes.
                relationship_terms = [
                    '父亲',
                    '母亲',
                    '爸爸',
                    '妈妈',
                    '爷爷',
                    '奶奶',
                    '姥姥',
                    '姥爷',
                    '外公',
                    '外婆',
                ]
                # Sentence templates; A is the elder, B the younger member
                # of each chain link.
                relationship_templates = [
                    '{A}是{B}的{relationship}。',
                    '{B}的{relationship}是{A}。',
                    '{A}作为{B}的{relationship},对{B}的成长有重要影响。',
                    '{A}不仅是{B}的{relationship},还是{B}的榜样。',
                    '{B}是{A}所生的孩子。',
                    '{A}对{B}来说,不只是一个{relationship},还是一个朋友。',
                    '{A}在{B}的生命中扮演着{relationship}的角色。',
                    '{B}把{A}视为其{relationship}。',
                ]
            elif language == 'English':
                relationship_terms = [
                    'father',
                    'mother',
                    'dad',
                    'mom',
                    'grandfather',
                    'grandmother',
                    'maternal grandmother',
                    'maternal grandfather',
                    'paternal grandfather',
                    'paternal grandmother',
                ]
                relationship_templates = [
                    "{A} is {B}'s {relationship}.",
                    "{B}'s {relationship} is {A}.",
                    ("{A}, as {B}'s {relationship}, "
                     "has a significant impact on {B}'s upbringing."),
                    ("{A} is not only {B}'s {relationship} "
                     "but also {B}'s role model."),
                    '{B} is the child of {A}.',
                    ('For {B}, {A} is not just a {relationship}, '
                     'but also a friend.'),
                    ("{A} plays the role of {B}'s {relationship} "
                     "in {B}'s life."),
                    '{B} considers {A} as their {relationship}.',
                ]

            def generate_chain_family_story(names, templates,
                                            relationship_terms):
                # One sentence per adjacent pair of names; '*' is used as a
                # sentence separator for the later split.
                story = ''
                for i in range(len(names) - 1):
                    template = random.choice(templates)
                    relation_term = random.choice(relationship_terms)
                    relation = template.format(A=names[i],
                                               B=names[i + 1],
                                               relationship=relation_term)
                    story += f'{relation}*'
                return story

            chain_story = generate_chain_family_story(names,
                                                      relationship_templates,
                                                      relationship_terms)

            # Splitting the chain_story into a list of fragments
            family_story_fragments = chain_story.split('*')

            # Joining the shuffled fragments back into a string
            # (no shuffle in this Ordered variant — chain order preserved).
            shuffled_story = ''.join(family_story_fragments)

            last_person = names[-1]

            # Generating the prompt based on the language
            if language == 'Chinese':
                prompt = f"""
在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?
例如:
例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。
例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。
例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。
"""
            elif language == 'English':
                prompt = f"""
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
For example:
Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark.
Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang.
Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang."""
            else:
                # NOTE(review): dead assignment — the raise below makes it
                # unreachable; kept byte-identical here.
                prompt = 'Language not supported.'
                raise Exception('Unsupported language specified. '
                                "Please choose either 'Chinese' or 'English'.")

            # Combine story and prompt
            shuffled_story_with_prompt = shuffled_story + ' ' + prompt

            data['prompt'].append(shuffled_story_with_prompt)
            # Answer stored as "<eldest>*<eldest>" — presumably a format
            # expected by a downstream postprocessor; verify against the
            # evaluator before changing.
            data['answer'].append(names[0] + '*' + names[0])

        dataset = Dataset.from_dict({
            'prompt': data['prompt'],
            'answer': data['answer'],
        })
        return dataset
return dataset

View File

@ -5,12 +5,13 @@ import os
import random
from datasets import Dataset
import numpy as np
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
from ..base import BaseDataset
from .atc import relationship_terms_zh_CN, relationship_templates_zh_CN, relationship_terms_en, relationship_templates_en
def get_number(options):
result_string = ''
@ -83,58 +84,12 @@ class NeedleBenchATCDataset(BaseDataset):
names = random.sample(all_names, num_needles)
if language == 'Chinese':
relationship_terms = [
'父亲',
'母亲',
'爸爸',
'妈妈',
'爷爷',
'奶奶',
'姥姥',
'姥爷',
'外公',
'外婆',
]
relationship_templates = [
'{A}{B}{relationship}',
'{B}{relationship}{A}',
'{A}作为{B}{relationship},对{B}的成长有重要影响。',
'{A}不仅是{B}{relationship},还是{B}的榜样。',
'{B}{A}所生的孩子。',
'{A}{B}来说,不只是一个{relationship},还是一个朋友。',
'{A}{B}的生命中扮演着{relationship}的角色。',
'{B}{A}视为其{relationship}',
]
relationship_terms = relationship_terms_zh_CN
relationship_templates = relationship_templates_zh_CN
elif language == 'English':
relationship_terms = [
'father',
'mother',
'dad',
'mom',
'grandfather',
'grandmother',
'maternal grandmother',
'maternal grandfather',
'paternal grandfather',
'paternal grandmother',
]
relationship_templates = [
"{A} is {B}'s {relationship}.",
"{B}'s {relationship} is {A}.",
("{A}, as {B}'s {relationship}, "
"has a significant impact on {B}'s upbringing."),
("{A} is not only {B}'s {relationship} "
"but also {B}'s role model."),
'{B} is the child of {A}.',
('For {B}, {A} is not just a {relationship}, '
'but also a friend.'),
("{A} plays the role of {B}'s {relationship} "
"in {B}'s life."),
'{B} considers {A} as their {relationship}.',
]
relationship_terms = relationship_terms_en
relationship_templates = relationship_templates_en
def generate_chain_family_story(names, templates,
relationship_terms):
@ -166,17 +121,48 @@ class NeedleBenchATCDataset(BaseDataset):
# Generating the prompt based on the language
if language == 'Chinese':
prompt = f"""
在上面提供的打乱的家族关系文本中'{last_person}'的能够向上追溯到的最年长的亲人是谁"""
下面是对你的多步推理能力的测试这个测试叫做祖先追溯测试我们会模拟不同人的家庭亲属关系你的任务是在其中不断推理直到找到最年长的祖先
注意
1. 你不必纠结这个测试中的人名的性别关系例如一个通常被视为女性化的名字仍然可以是其他人的父亲我们的重点是谁更年长
2. 忽略这个测试中的姓氏遗传问题例如李明仍然可能是王鹏的亲生父亲我们只关注谁更年长不必纠结孩子是否应该继承父亲或母亲的性别
3. 在回答的最后所以答案是A/B/C/D的格式回答正确答案(你只能选择一个选项)
现在打乱的家族关系文本如下
{shuffled_story}
在上面提供的打乱的家族关系文本中'{last_person}'的能够向上追溯到的最年长的亲人是谁
例如
例子1.如果张强的父亲是马克除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克
例子2.如果李明的姥姥是张红而张红的父亲是张强除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强
例子3.如果小明是张红的曾孙女张红的祖母是王华王华的父亲是王刚除此以外提供的文本中没有更多关于亲属关系的信息那么小明能够向上追溯到的最年长的亲人就是王刚
"""
elif language == 'English':
prompt = f"""
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?"""
Here is a test for multi-step reasoning ability called the Ancestral Trace Challenge. In this test, we will simulate different peoples familial relationships, and your task is to continuously reason through them until you identify the eldest ancestor.
Notes:
1. You do not need to worry about the gender consistency of names in this test. For example, a name that is typically considered feminine can still be the father of another person. Our primary focus is on who is older.
2. Ignore surname inheritance issues. For instance, Li Ming could still be the biological father of Wang Peng. We only care about who is older and do not need to consider whether a child should inherit the fathers or mothers surname.
3. At the end of your response, state the correct answer in the format: So the answer is A/B/C/D.(You must choose only one option.)
Now, the scrambled family relationships are provided below:
{shuffled_story}
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
For example:
Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark.
Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang.
Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang."""
else:
prompt = 'Language not supported.'
raise Exception('Unsupported language specified. '
"Please choose either 'Chinese' or 'English'.")
# Combine story and prompt
shuffled_story_with_prompt = shuffled_story + ' ' + prompt
shuffled_story_with_prompt = prompt
entry['question'] = shuffled_story_with_prompt
if len(names) < 4:
@ -187,7 +173,14 @@ Given the scrambled family relationships described above, who is the eldest rela
)
names.extend(additional_names)
entry['options'] = names[0:4]
num_samples = 3
if len(names) > 1:
indices = np.linspace(1, len(names) - 1, num_samples, dtype=int) # Generate evenly spaced indices
sampled_names = [names[i] for i in indices] # Select corresponding elements
entry['options'] = names[:1] + sampled_names
else:
entry['options'] = names # Return directly if only one element
assert len(entry['options']) == 4
entry['answer'] = 'A'
# print(entry)
data.extend(get_circular_example(entry, id))

View File

@ -0,0 +1,248 @@
# flake8: noqa
import json
import os
import random
import re
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils import get_data_path
from opencompass.datasets.math import extract_boxed_answer
relationship_templates_zh_CN = [
'{A}{B}{relationship}',
'{B}{relationship}{A}',
'{A}作为{B}{relationship},对{B}的成长有重要影响。',
'{A}不仅是{B}{relationship},还是{B}的榜样。',
'{B}{A}所生的孩子。',
'{A}{B}来说,不只是一个{relationship},还是一个朋友。',
]
relationship_terms_zh_CN = [
'父亲',
'母亲',
'爸爸',
'妈妈',
'爷爷',
'奶奶',
'姥姥',
'姥爷',
'外公',
'外婆',
]
relationship_terms_en = [
'father',
'mother',
'dad',
'mom',
'grandfather',
'grandmother',
'maternal grandmother',
'maternal grandfather',
'paternal grandfather',
'paternal grandmother',
]
relationship_templates_en = [
"{A} is {B}'s {relationship}.",
"{B}'s {relationship} is {A}.",
("{A}, as {B}'s {relationship}, "
"has a significant impact on {B}'s upbringing."),
("{A} is not only {B}'s {relationship} "
"but also {B}'s role model."),
'{B} is the child of {A}.',
('For {B}, {A} is not just a {relationship}, '
'but also a friend.'),
"For {B}, {A} is more than just a {relationship}; {A} is a lifelong mentor of {B}.",
]
shuffled_story_with_prompt_zh_CN = """下面是对你的多步推理能力的测试,这个测试叫做祖先追溯测试,我们会模拟不同人的家庭亲属关系,你的任务是在其中不断推理,直到找到最年长的祖先。
例如
例子1.如果张强的父亲是马克除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克
例子2.如果李明的姥姥是张红而张红的父亲是张强除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强
例子3.如果小明是张红的曾孙女张红的祖母是王华王华的父亲是王刚除此以外提供的文本中没有更多关于亲属关系的信息那么小明能够向上追溯到的最年长的亲人就是王刚
注意
1. 你不必纠结这个测试中的人名的性别关系例如一个通常被视为女性化的名字仍然可以是其他人的父亲我们的重点是谁更年长
2. 忽略这个测试中的姓氏遗传问题例如李明仍然可能是王鹏的亲生父亲我们只关注谁更年长不必纠结孩子是否应该继承父亲或母亲的性别
3. 在回答的最后将你的答案放在\\boxed{{}}例如"所以{last_person}能向上追溯到的最年长的亲人就是\\boxed{{某人(你的答案)}}"
现在打乱的家族关系文本如下
{shuffled_story}
在上面提供的打乱的家族关系文本中'{last_person}'的能够向上追溯到的最年长的亲人是谁
"""
shuffled_story_with_prompt_en = """Here is a test for multi-step reasoning ability called the Ancestral Trace Challenge. In this test, we will simulate different people's familial relationships, and your task is to continuously reason through them until you identify the eldest ancestor.
For example:
Example 1: If James Hill's father is Jasmine Lane, and no further information about familial relationships is provided in the text, then the oldest relative James Hill can trace back to in the provided text is \\boxed{{Jasmine Lane}}.
Example 2: If Andrew Williams's grandmother is Dan Newton, and Dan Newton's father is James Hill, and no further information about familial relationships is provided in the text, then the oldest relative Andrew Williams can trace back to in the provided text is \\boxed{{James Hill}}.
Example 3: If Jeff White's father is Kevin Le, Dan Newton's grandmother is Jeff White, and Jeff White's father is Kevin Le, and Shelley Mills is Dan Newton's great-granddaughter, and no further information about familial relationships is provided in the text, then the oldest relative Shelley Mills can trace back to in the provided text is \\boxed{{Kevin Le}}.
Notes:
1. You do not need to worry about the gender consistency of names in this test. For example, a name that is typically considered feminine can still be the father of another person. Our primary focus is on who is older.
2. Ignore surname inheritance issues. For instance, Andrew Williams could still be the biological father of Christopher Baker. We only care about who is older and do not need to consider whether a child should inherit the father's or mother's surname.
3. At the end of your response, remember to put your final answer within \\boxed{{}}. For example: "So the oldest relative '{last_person}' can trace back to in the provided text is \\boxed{{somebody (your answer here)}}."
Now, the scrambled family relationships are provided below:
{shuffled_story}
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
"""
@LOAD_DATASET.register_module()
class NeedleBenchATCDataset(BaseDataset):
    """Ancestral Trace Challenge (ATC) dataset.

    For each sample a chain of ``num_needles`` people is linked by randomly
    chosen kinship sentences (oldest -> youngest), the sentences are
    shuffled, and the model must trace back to the head of the chain — the
    eldest ancestor.
    """

    @staticmethod
    def load(
        path,
        file_name: str,
        num_needles: int,
        language: str,
        repeats: int,
    ):
        """Generate ``repeats`` eldest-ancestor questions.

        Args:
            path: Dataset root, resolved via ``get_data_path`` (downloaded
                from the HF hub when ``DATASET_SOURCE=HF``).
            file_name: JSON file mapping language -> comma-separated names.
            num_needles: Number of people in each relationship chain.
            language: 'Chinese' or 'English'.
            repeats: Number of generated samples.

        Returns:
            ``datasets.Dataset`` with 'prompt' and 'answer' columns; the
            answer is the eldest ancestor (first name of the chain).

        Raises:
            ValueError: If ``language`` is neither 'Chinese' nor 'English'.
        """
        data = {'prompt': [], 'answer': []}
        path = get_data_path(path)
        if os.environ.get('DATASET_SOURCE') == 'HF':
            from huggingface_hub import snapshot_download
            path = snapshot_download(repo_id=path, repo_type='dataset')
        file_path = os.path.join(path, file_name)

        with open(file_path, 'r', encoding='utf-8') as file:
            names_data = json.load(file)

        # The JSON stores one comma-separated string of names per language.
        all_names = names_data[language].split(',')

        # Select language material once, up front. The original deferred
        # this check into the loop and crashed with a NameError (unbound
        # relationship_terms) for unsupported languages.
        if language == 'Chinese':
            relationship_terms = relationship_terms_zh_CN
            relationship_templates = relationship_templates_zh_CN
            prompt_template = shuffled_story_with_prompt_zh_CN
        elif language == 'English':
            relationship_terms = relationship_terms_en
            relationship_templates = relationship_templates_en
            prompt_template = shuffled_story_with_prompt_en
        else:
            raise ValueError('Unsupported language specified. '
                             "Please choose either 'Chinese' or 'English'.")

        def generate_chain_family_story(chain_names, templates, terms):
            # One sentence per adjacent pair; '*' separates sentences so
            # the story can be split and shuffled. The RNG call order
            # (template first, then term) matches the original so the
            # generated samples stay byte-identical.
            story = ''
            for idx in range(len(chain_names) - 1):
                template = random.choice(templates)
                relation_term = random.choice(terms)
                story += template.format(A=chain_names[idx],
                                         B=chain_names[idx + 1],
                                         relationship=relation_term) + '*'
            return story

        for i in range(repeats):
            # Fixed per-sample seed keeps samples stable across runs.
            random.seed(i)
            names = random.sample(all_names, num_needles)

            chain_story = generate_chain_family_story(
                names, relationship_templates, relationship_terms)

            # Split into sentence fragments and shuffle them so the chain
            # must be reconstructed by the model.
            family_story_fragments = chain_story.split('*')
            random.shuffle(family_story_fragments)
            shuffled_story = ''.join(family_story_fragments)

            last_person = names[-1]
            shuffled_story_with_prompt = prompt_template.format(
                shuffled_story=shuffled_story, last_person=last_person)

            data['prompt'].append(shuffled_story_with_prompt)
            # The chain runs oldest -> youngest, so names[0] is the answer.
            data['answer'].append(names[0])

        dataset = Dataset.from_dict({
            'prompt': data['prompt'],
            'answer': data['answer'],
        })
        return dataset
def clean_atc_answer(text: str) -> str:
    """Normalize a raw model prediction (QwQ-32B-Preview style) into a bare name.

    Strips LaTeX wrappers (``\\text{...}``, ``\\boxed{...}``), leftover
    backslashes, quotes and tildes, then collapses runs of whitespace.

    Args:
        text: Raw prediction text.

    Returns:
        The standardized name, or ``"None"`` when the input is empty
        or already ``"None"``.
    """
    if not text or text == "None":
        return "None"

    # Unwrap LaTeX commands but keep their contents.
    text = re.sub(r'\\text\{([^}]+)\}', r'\1', text)
    text = re.sub(r'\\boxed\{([^}]+)\}', r'\1', text)
    text = re.sub(r'\\[\[\]]', '', text)

    # Drop any remaining backslashes (double first, then single).
    text = text.replace('\\\\', '').replace('\\', '')

    # Remove quotes and turn tildes into spaces *before* collapsing
    # whitespace, so e.g. 'a~~b' normalizes to 'a b' rather than 'a  b'.
    text = text.replace('"', '').replace("'", '')
    text = text.replace('~', ' ')

    # Collapse whitespace last so earlier substitutions cannot
    # reintroduce double spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
@TEXT_POSTPROCESSORS.register_module('needlebench_atc_postprocess_v2')
def needlebench_atc_postprocess_v2(text: str) -> str:
    """Extract the \\boxed{...} answer from *text* and normalize it.

    Falls back to the sentinel string "None" when no boxed answer
    can be found in the model output.
    """
    boxed = extract_boxed_answer(text, strip_double_curly_brace=True)
    if not boxed:
        return "None"
    return clean_atc_answer(boxed)
@ICL_EVALUATORS.register_module("needlebench_atc_evaluator")
class NeedleBenchATCEvaluator(BaseEvaluator):
def score(self, predictions, gold):
if len(predictions) != len(gold):
return {'error': 'predictions and gold have different lengths'}
correct_count = 0
details = []
for prediction, reference in zip(predictions, gold):
reference_name = reference
if prediction.strip() == reference_name.strip():
correct_count += 1
detail = {
'pred': prediction,
'answer': reference_name,
'correct': prediction.strip() == reference_name.strip()
}
details.append(detail)
accuracy = (correct_count / len(predictions)) * 100 if predictions else 0
result = {'score': accuracy, 'details': details}
return result

View File

@ -4,32 +4,66 @@ import random
import tiktoken
from datasets import Dataset
from huggingface_hub import hf_hub_download
from opencompass.datasets.base import BaseDataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
from opencompass.datasets.needlebench.atc import relationship_templates_zh_CN, relationship_terms_zh_CN, relationship_templates_en, relationship_terms_en
def get_random_needles(counter, file_path, needle_count):
def get_random_needles(counter, file_path, num_needles, language):
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
names_data = json.load(file)
matching_records = [
record for record in data
if record.get('derivation_count') == needle_count
]
all_names = names_data[language].split(',')
if matching_records:
random.seed(counter)
random_record = random.choice(matching_records)
return {
'needles': random_record['derivations'],
'answer': random_record['answer'],
'retrieval_question': random_record['question']
}
random.seed(counter)
names = random.sample(all_names, num_needles)
if language == 'Chinese':
relationship_terms = relationship_terms_zh_CN
relationship_templates = relationship_templates_zh_CN
elif language == 'English':
relationship_terms = relationship_terms_en
relationship_templates = relationship_templates_en
else:
return None
raise ValueError(f"Unsupported language '{language}' specified.")
def generate_chain_family_story(names, templates, relationship_terms):
story = ''
for i in range(len(names) - 1):
template = random.choice(templates)
relation_term = random.choice(relationship_terms)
relation = template.format(A=names[i], B=names[i + 1], relationship=relation_term)
story += f'{relation}*'
return story
chain_story = generate_chain_family_story(names, relationship_templates, relationship_terms)
# Splitting the chain_story into a list of fragments
family_story_fragments = chain_story.split('*')
# Removing the empty string from the list
family_story_fragments = [fragment for fragment in family_story_fragments if fragment]
# Shuffling the list of fragments
random.shuffle(family_story_fragments)
last_person = names[-1]
# Generating the retrieval question based on the language
if language == 'Chinese':
retrieval_question = f"在上面提供的文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?"
elif language == 'English':
retrieval_question = f"Given the context described above, who is the eldest relative that '{last_person}' can trace back to in the context?"
# Returning the story, answer, and retrieval question
return {
'needles': family_story_fragments,
'answer': names[0],
'retrieval_question': retrieval_question,
'last_person': last_person
}
@LOAD_DATASET.register_module()
@ -37,19 +71,18 @@ class NeedleBenchMultiDataset(BaseDataset):
@staticmethod
def load(
path: str,
path: str, # depreciated
length: int,
depth: int,
tokenizer_model: str,
file_list: 'list[str]',
num_repeats_per_file: int,
length_buffer: int,
guide: bool,
language: str,
needle_file_name: str,
num_needles: int,
diff: int,
position: str = 'End',
quesiton_position: str = 'End',
):
data = {'prompt': [], 'answer': []}
tokenizer = tiktoken.encoding_for_model(tokenizer_model)
@ -91,82 +124,123 @@ class NeedleBenchMultiDataset(BaseDataset):
def _decode_tokens(tokens):
return tokenizer.decode(tokens)
def _modify_retrieval_question(retrieval_question):
if language == 'Chinese':
guide_retrieval_question = (retrieval_question +
'在回答之前,请思考文档中与此问题'
'最相关的内容是什么。')
return guide_retrieval_question
elif language == 'English':
guide_retrieval_question = (
retrieval_question + 'Before answering, please consider'
' what in the document is most relevant to this question.')
return guide_retrieval_question
else:
raise ValueError(f"Language '{language}' is not supported.")
def _generate_prompt(context, retrieval_question):
if guide:
retrieval_question = _modify_retrieval_question(
retrieval_question)
def _generate_prompt(context, retrieval_question, last_person):
if language == 'Chinese':
if position == 'End':
prompt = ('你是一个善于回答用户问题的智能AI助手\n'
'请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
',或重复你的回答\n'
f'用户现在给你的文档是{context}\n\n'
f'现在请问:{retrieval_question}')
elif position == 'Start':
prompt = ('你是一个善于回答用户问题的智能AI助手\n'
'请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
',或重复你的回答\n'
f'现在请问:{retrieval_question}',
f'用户现在给你的文档是{context}\n\n')
if quesiton_position == 'End':
prompt = f'''这是一个长文本能力的测试,你需要首先阅读下面的长文档,然后根据文档中的信息回答最后的问题。
长文档的内容如下
<文档>
{context}
</文档>
根据文档中的信息现在请问{retrieval_question}
例如
例子1.如果张强的父亲是马克除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克
例子2.如果李明的姥姥是张红而张红的父亲是张强除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强
例子3.如果小明是张红的曾孙女张红的祖母是王华王华的父亲是王刚除此以外提供的文本中没有更多关于亲属关系的信息那么小明能够向上追溯到的最年长的亲人就是王刚
注意
1. 你不必纠结这个测试中的人名的性别关系例如一个通常被视为女性化的名字仍然可以是其他人的父亲我们的重点是谁更年长
2. 忽略这个测试中的姓氏遗传问题例如李明仍然可能是王鹏的亲生父亲我们只关注谁更年长不必纠结孩子是否应该继承父亲或母亲的性别
3. 在回答的最后将你的答案放在\\boxed{{}}例如所以{last_person}能向上追溯到的最年长的亲人就是\\boxed{{你的答案}}
'''
elif quesiton_position == 'Start':
prompt = f'''这是一个长文本能力的测试,你需要首先阅读下面的问题,然后根据最后长文档中的信息回答下面的问题。
现在请问{retrieval_question}
例如
例子1.如果张强的父亲是马克除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克
例子2.如果李明的姥姥是张红而张红的父亲是张强除此以外提供的文本中没有更多关于亲属关系的信息那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强
例子3.如果小明是张红的曾孙女张红的祖母是王华王华的父亲是王刚除此以外提供的文本中没有更多关于亲属关系的信息那么小明能够向上追溯到的最年长的亲人就是王刚
注意
1. 你不必纠结这个测试中的人名的性别关系例如一个通常被视为女性化的名字仍然可以是其他人的父亲我们的重点是谁更年长
2. 忽略这个测试中的姓氏遗传问题例如李明仍然可能是王鹏的亲生父亲我们只关注谁更年长不必纠结孩子是否应该继承父亲或母亲的性别
3. 在回答的最后将你的答案放在\\boxed{{}}例如所以{last_person}能向上追溯到的最年长的亲人就是\\boxed{{你的答案}}
长文档内容的如下
<文档>
{context}
</文档>
'''
else:
raise ValueError('Unsupported position. '
raise ValueError('Unsupported quesiton_position. '
'Position must be "End" or "Start".')
elif language == 'English':
if position == 'End':
prompt = ('You are an intelligent AI assistant skilled in '
'answering user questions.\n'
'Please keep your answers concise and clear. Do '
'not talk about irrelevant topics or repeat '
'your answers.\nThe document '
f'given to you by the user is {context}\n\n'
f'Now, the question is: {retrieval_question}')
elif position == 'Start':
prompt = ('You are an intelligent AI assistant skilled in '
'answering user questions.\n'
'Please keep your answers concise and clear. Do '
'not talk about irrelevant topics or repeat '
'your answers.\n'
f'Now, the question is: {retrieval_question}'
'The document given to you by the user'
f' is {context}\n\n')
if quesiton_position == 'End':
prompt = f'''This is a test of long-text capability. You need to first read the long document below, and then answer the final question based on the information in the document.
The content of the long document is as follows
<Document>
{context}
</Document>
Based on the information in the document, now please answer: {retrieval_question}
For example:
Example 1: If James Hill's father is Jasmine Lane, and no further information about familial relationships is provided in the text, then the oldest relative James Hill can trace back to in the provided text is \\boxed{{Jasmine Lane}}.
Example 2: If Andrew Williams's grandmother is Dan Newton, and Dan Newton's father is James Hill, and no further information about familial relationships is provided in the text, then the oldest relative Andrew Williams can trace back to in the provided text is \\boxed{{James Hill}}.
Example 3: If Jeff White's father is Kevin Le, Dan Newton's grandmother is Jeff White, and Jeff White's father is Kevin Le, and Shelley Mills is Dan Newton's great-granddaughter, and no further information about familial relationships is provided in the text, then the oldest relative Shelley Mills can trace back to in the provided text is \\boxed{{Kevin Le}}.
Notes:
1. You do not need to worry about the gender consistency of names in this test. For example, a name that is typically considered feminine can still be the father of another person. Our primary focus is on who is older.
2. Ignore surname inheritance issues. For instance, Andrew Williams could still be the biological father of Christopher Baker. We only care about who is older and do not need to consider whether a child should inherit the father's or mother's surname.
3. At the end of your response, remember to put your final answer within \\boxed{{}}. For example: "So the oldest relative '{last_person}' can trace back to in the provided text is \\boxed{{(your answer here)}}."
'''
elif quesiton_position == 'Start':
prompt = f'''This is a test of long-text capability. You need to first read the question below, and then answer it based on the information in the long document that follows.
Now please answer: {retrieval_question}
For example:
Example 1: If James Hill's father is Jasmine Lane, and no further information about familial relationships is provided in the text, then the oldest relative James Hill can trace back to in the provided text is \\boxed{{Jasmine Lane}}.
Example 2: If Andrew Williams's grandmother is Dan Newton, and Dan Newton's father is James Hill, and no further information about familial relationships is provided in the text, then the oldest relative Andrew Williams can trace back to in the provided text is \\boxed{{James Hill}}.
Example 3: If Jeff White's father is Kevin Le, Dan Newton's grandmother is Jeff White, and Jeff White's father is Kevin Le, and Shelley Mills is Dan Newton's great-granddaughter, and no further information about familial relationships is provided in the text, then the oldest relative Shelley Mills can trace back to in the provided text is \\boxed{{Kevin Le}}.
Notes:
1. You do not need to worry about the gender consistency of names in this test. For example, a name that is typically considered feminine can still be the father of another person. Our primary focus is on who is older.
2. Ignore surname inheritance issues. For instance, Andrew Williams could still be the biological father of Christopher Baker. We only care about who is older and do not need to consider whether a child should inherit the father's or mother's surname.
3. At the end of your response, remember to put your final answer within \\boxed{{}}. For example: "So the oldest relative '{last_person}' can trace back to in the provided text is \\boxed{{(your answer here)}}."
The content of the long document is as follows
<Document>
{context}
</Document>
'''
else:
raise ValueError(f'Unsupported position {position}. '
raise ValueError(f'Unsupported quesiton_position {quesiton_position}. '
'Position must be "End" or "Start".')
else:
raise ValueError(f"Language '{language}' is not supported.")
return prompt
repo_id = 'opencompass/NeedleBench'
file_names = [
'PaulGrahamEssays.jsonl', 'multi_needle_reasoning_en.json',
'multi_needle_reasoning_zh.json', 'zh_finance.jsonl',
'PaulGrahamEssays.jsonl','names.json', 'zh_finance.jsonl',
'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl',
'zh_movie.jsonl', 'zh_tech.jsonl'
]
path = get_data_path(path)
if os.environ.get('DATASET_SOURCE') == 'HF':
from huggingface_hub import snapshot_download
path = snapshot_download(repo_id=path, repo_type='dataset')
needle_file_path = os.path.join(path, needle_file_name)
downloaded_files = []
base_file_path = ''
for file_name in file_names:
file_path = os.path.join(path, file_name)
if file_name not in file_list:
file_path = hf_hub_download(repo_id=repo_id,
filename=file_name,
repo_type='dataset')
downloaded_files.append(file_path)
base_file_path = '/'.join(file_path.split('/')[:-1])
needle_file_path = os.path.join(base_file_path, needle_file_name)
for file_path in downloaded_files:
if file_path.split('/')[-1] not in file_list:
continue
with open(file_path, 'r', encoding='utf-8') as f:
@ -176,7 +250,8 @@ class NeedleBenchMultiDataset(BaseDataset):
random.seed(counter)
random.shuffle(lines)
random_needle_data = get_random_needles(
counter, needle_file_path, num_needles)
counter, needle_file_path, num_needles+1, language)
last_person = random_needle_data['last_person']
needles = [
'\n' + needle + '\n'
for needle in random_needle_data['needles']
@ -203,66 +278,13 @@ class NeedleBenchMultiDataset(BaseDataset):
needles)
processed_prompt = _generate_prompt(processed_text,
retrieval_question)
retrieval_question, last_person)
data['prompt'].append(processed_prompt)
data['answer'].append(answer + '*' + keyword)
data['answer'].append(keyword)
dataset = Dataset.from_dict({
'prompt': data['prompt'],
'answer': data['answer'],
})
return dataset
class NeedleBenchMultiEvaluator(BaseEvaluator):
def levenshtein_distance(self, s1, s2):
if len(s1) < len(s2):
return self.levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def score(self, predictions, gold):
if len(predictions) != len(gold):
return {'error': 'predictions and gold have different lengths'}
total_score = 0
details = []
for prediction, reference in zip(predictions, gold):
answer, keyword = reference.split('*')
keywords = keyword.lower().split()
prediction = prediction.lower()
keyword_score = 100 / len(keywords) if keywords else 0
matched_keywords = sum(1 for kword in keywords
if kword in prediction)
score = matched_keywords * keyword_score
detail = {
'pred': prediction,
'answer': reference,
'matched_keywords': matched_keywords,
'score': score
}
total_score += score
details.append(detail)
average_score = total_score / len(predictions) if predictions else 0
return {'score': average_score, 'details': details}
return dataset

View File

@ -43,10 +43,9 @@ class NeedleBenchOriginDataset(BaseDataset):
file_list: list[str],
num_repeats_per_file: int,
length_buffer: int,
guide: bool,
language: str,
needle_file_name: str,
position: str = 'End',
quesiton_position: str = 'End',
):
data = {'prompt': [], 'answer': []}
tokenizer = tiktoken.encoding_for_model(tokenizer_model)
@ -65,83 +64,57 @@ class NeedleBenchOriginDataset(BaseDataset):
def _decode_tokens(tokens):
return tokenizer.decode(tokens)
def _modify_retrieval_question(retrieval_question):
if language == 'Chinese':
parts = retrieval_question.split('请按照')
guide_retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题'
'最相关的内容是什么。请按照' + parts[1])
return guide_retrieval_question
elif language == 'English':
parts = retrieval_question.split('Please answer in the format')
guide_retrieval_question = (
parts[0] + 'Before answering, please consider'
' what in the document is most relevant to this question.'
' Please answer in the format' + parts[1])
return guide_retrieval_question
else:
raise ValueError(f"Language '{language}' is not supported.")
def _modify_retrieval_question_for_base(retrieval_question):
if language == 'Chinese':
parts = retrieval_question.split('请按照')
retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题'
'最相关的内容是什么。请按照' + parts[1])
return retrieval_question.replace("请按照'", '')[:-16]
elif language == 'English':
parts = retrieval_question.split('Please answer in the format')
retrieval_question = (
parts[0] + 'Before answering, please consider'
' what in the document is most relevant to this question.'
' Please answer in the format' + parts[1])
return retrieval_question.replace(
"Please answer in the format '", '')[:-10]
else:
raise ValueError(f"Language '{language}' is not supported.")
def _generate_prompt(context, retrieval_question):
if guide:
retrieval_question = _modify_retrieval_question(
retrieval_question)
else:
retrieval_question = _modify_retrieval_question_for_base(
retrieval_question)
if language == 'Chinese':
if position == 'End':
prompt = ('你是一个善于回答用户问题的智能AI助手\n'
'请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
',或重复你的回答\n'
f'用户现在给你的文档是{context}\n\n'
f'现在请问:{retrieval_question}')
elif position == 'Start':
prompt = ('你是一个善于回答用户问题的智能AI助手\n'
'请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
',或重复你的回答\n'
f'现在请问:{retrieval_question}',
f'用户现在给你的文档是{context}\n\n')
if quesiton_position == 'End':
prompt = f'''这是一个长文本能力的测试,你需要首先阅读下面的长文档,然后根据文档中的信息回答最后的问题。
长文档的内容如下
<文档>
{context}
</文档>
根据文档中的信息现在请问{retrieval_question}
'''
elif quesiton_position == 'Start':
prompt = f'''这是一个长文本能力的测试,你需要首先阅读下面的问题,然后根据最后长文档中的信息回答下面的问题。
现在请问{retrieval_question}
长文档内容的如下
<文档>
{context}
</文档>
'''
else:
raise ValueError('Unsupported position. '
raise ValueError('Unsupported quesiton_position. '
'Position must be "End" or "Start".')
elif language == 'English':
if position == 'End':
prompt = ('You are an intelligent AI assistant skilled in '
'answering user questions.\n'
'Please keep your answers concise and clear. Do '
'not talk about irrelevant topics or repeat '
'your answers.\nThe document '
f'given to you by the user is {context}\n\n'
f'Now, the question is: {retrieval_question}')
elif position == 'Start':
prompt = ('You are an intelligent AI assistant skilled in '
'answering user questions.\n'
'Please keep your answers concise and clear. Do '
'not talk about irrelevant topics or repeat '
'your answers.\n'
f'Now, the question is: {retrieval_question}'
'The document given to you by the user'
f' is {context}\n\n')
if quesiton_position == 'End':
prompt = f'''This is a test of long-text capability. You need to first read the long document below, and then answer the final question based on the information in the document.
The content of the long document is as follows
<Document>
{context}
</Document>
Based on the information in the document, now please answer: {retrieval_question}
'''
elif quesiton_position == 'Start':
prompt = f'''This is a test of long-text capability. You need to first read the question below, and then answer it based on the information in the long document that follows.
Now please answer: {retrieval_question}
The content of the long document is as follows
<Document>
{context}
</Document>
'''
else:
raise ValueError(f'Unsupported position {position}. '
raise ValueError(f'Unsupported quesiton_position {quesiton_position}. '
'Position must be "End" or "Start".')
else:
raise ValueError(f"Language '{language}' is not supported.")
@ -209,51 +182,6 @@ class NeedleBenchOriginDataset(BaseDataset):
class NeedleBenchOriginEvaluator(BaseEvaluator):
def __init__(self, use_trim=False):
self.use_trim = use_trim
@staticmethod
def _trim_prediction(prediction, reference):
"""Trims the prediction string based on the length of the reference
string.
Args:
prediction (str): The prediction string.
reference (str): The reference string.
Returns:
str: The trimmed prediction string.
"""
l08 = int(0.8 * len(reference))
l12 = int(1.2 * len(reference))
trimmed_prediction = prediction[:l12]
if len(trimmed_prediction) > l08 and \
reference[-1] in trimmed_prediction[l08:]:
end_pos = l08 + trimmed_prediction[l08:].index(reference[-1]) + 1
trimmed_prediction = trimmed_prediction[:end_pos]
return trimmed_prediction
def levenshtein_distance(self, s1, s2):
if len(s1) < len(s2):
return self.levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def score(self, predictions, gold):
if len(predictions) != len(gold):
@ -268,26 +196,14 @@ class NeedleBenchOriginEvaluator(BaseEvaluator):
prediction = re.sub(r'\s+', '', prediction)
reference = re.sub(r'\s+', '', reference)
if self.use_trim:
prediction = NeedleBenchOriginEvaluator._trim_prediction(
prediction, reference)
edit_distance = self.levenshtein_distance(prediction, reference)
max_len = max(len(prediction), len(reference))
score = 100 * (1 -
edit_distance / max_len) if max_len != 0 else 100
if keyword in raw_prediction:
print(f'{keyword} is in {prediction}')
score = 100
else:
print(f'{keyword} is not in {prediction}')
score = 0.2 * score
score = 0
detail = {
'pred': prediction,
'answer': reference,
'edit_distance': edit_distance,
'score': score
}
total_score += score
@ -303,6 +219,6 @@ def needlebench_postprocess(text: str) -> str:
return text
@TEXT_POSTPROCESSORS.register_module('needlebench_dataset')
@TEXT_POSTPROCESSORS.register_module('needlebench_dataset_postprocess')
def needlebench_dataset_postprocess(text: str) -> str:
return text

View File

@ -70,9 +70,8 @@ class NeedleBenchParallelDataset(BaseDataset):
file_list: list[str],
num_repeats_per_file: int,
length_buffer: int,
guide: bool,
language: str,
position: str = 'End',
quesiton_position: str = 'End',
):
data = {'prompt': [], 'answer': []}
tokenizer = tiktoken.encoding_for_model(tokenizer_model)
@ -135,65 +134,56 @@ class NeedleBenchParallelDataset(BaseDataset):
def _decode_tokens(tokens):
return tokenizer.decode(tokens)
def _modify_retrieval_question(retrieval_question):
if language == 'Chinese':
parts = retrieval_question.split('请按照')
guide_retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题'
'最相关的内容是什么。请按照' + parts[1])
return guide_retrieval_question
elif language == 'English':
parts = retrieval_question.split('Please answer in the format')
guide_retrieval_question = (
parts[0] + 'Before answering, please consider'
' what in the document is most relevant to this question.'
' Please answer in the format' + parts[1])
return guide_retrieval_question
else:
raise ValueError(f"Language '{language}' is not supported.")
def _generate_prompt(context, retrieval_question):
if guide:
retrieval_question = _modify_retrieval_question(
retrieval_question)
if language == 'Chinese':
if position == 'End':
prompt = ('你是一个善于回答用户问题的智能AI助手\n'
'请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
',或重复你的回答\n请先仔细阅读下面的文档再依次回答'
f'最后提出的问题\n用户现在给你的文档是{context}\n\n'
f'现在请问:{retrieval_question}\n')
elif position == 'Start':
prompt = ('你是一个善于回答用户问题的智能AI助手\n'
'请保持你的回答简洁清楚。不要说和下面文档中的无关的话'
',或重复你的回答\n请先仔细阅读下面的文档再依次回答'
f'最后提出的问题\n现在请问:{retrieval_question}\n\n'
f'用户现在给你的文档是{context}\n')
else:
raise ValueError(f'Unsupported position {position}. '
'Position must be "End" or "Start".')
if quesiton_position == 'End':
prompt = f'''这是一个长文本能力的测试,你需要首先阅读下面的长文档,然后根据文档中的信息,依次回答最后的问题。
长文档的内容如下
elif language == 'English':
if position == 'End':
prompt = (
'You are an intelligent AI assistant skilled in '
'answering user questions.\n'
'Please keep your answers concise and clear. Do not'
' talk about irrelevant topics or repeat your '
'answers.\n'
f'The document given to you by the user is {context}'
f'\n\nNow, the questions are: {retrieval_question}\n')
elif position == 'Start':
prompt = (
'You are an intelligent AI assistant skilled in '
'answering user questions.\n'
'Please keep your answers concise and clear. Do not'
' talk about irrelevant topics or repeat your '
'answers.\n'
f'\nNow, the questions are: {retrieval_question}\n\n'
f'The document given to you by the user is {context}')
<文档>
{context}
</文档>
根据文档中的信息现在请问{retrieval_question}
'''
elif quesiton_position == 'Start':
prompt = f'''这是一个长文本能力的测试,你需要首先阅读下面的问题,然后根据最后长文档中的信息,依次回答下面的问题。
现在请问{retrieval_question}
长文档内容的如下
<文档>
{context}
</文档>
'''
else:
raise ValueError(f'Unsupported position {position}. '
raise ValueError(f'Unsupported quesiton_position {quesiton_position}. '
'Position must be "End" or "Start".')
elif language == 'English':
if quesiton_position == 'End':
prompt = f'''This is a test of long-text capability. You need to first read the long document below, and then answer the final questions one by one based on the information in the document.
The content of the long document is as follows
<Document>
{context}
</Document>
Based on the information in the document, now please answer: {retrieval_question}
'''
elif quesiton_position == 'Start':
prompt = f'''This is a test of long-text capability. You need to first read the questions below, and then answer them one by one based on the information in the long document that follows.
Now please answer: {retrieval_question}
The content of the long document is as follows
<Document>
{context}
</Document>
'''
else:
raise ValueError(f'Unsupported quesiton_position {quesiton_position}. '
'Position must be "End" or "Start".')
else:
raise ValueError(f"Language '{language}' is not supported.")
@ -279,26 +269,6 @@ class NeedleBenchParallelDataset(BaseDataset):
class NeedleBenchParallelEvaluator(BaseEvaluator):
def levenshtein_distance(self, s1, s2):
if len(s1) < len(s2):
return self.levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def score(self, predictions, gold):
if len(predictions) != len(gold):
return {'error': 'predictions and gold have different lengths'}

View File

@ -61,15 +61,28 @@ model_name_mapping = {
'qwen1.5-4b-chat-hf': 'Qwen-1.5-4B',
'qwen1.5-14b-chat-hf': 'Qwen-1.5-14B',
'qwen1.5-72b-chat-hf': 'Qwen-1.5-72B',
'qwen1.5-1.8b-chat-vllm': 'Qwen-1.5-1.8B',
'qwen1.5-14b-chat-vllm': 'Qwen-1.5-14B-vLLM',
'qwen1.5-72b-chat-vllm': 'Qwen-1.5-72B-vLLM',
'glm4_notools': 'GLM-4',
'claude-3-opus': 'Claude-3-Opus',
'glm-4-9b-chat-1m-vllm': 'GLM4-9B-Chat-1M',
'internlm2_5-7b-chat-1m-turbomind': 'InternLM2.5-7B-Chat-1M',
'internlm3-8b-instruct-turbomind': 'InternLM3-8B-Instruct',
'llama-3.1-8b-instruct-vllm': 'LLaMA-3.1-8B',
'qwen2.5-1.5b-instruct-vllm': 'Qwen-2.5-1.5B',
'qwen2.5-7b-instruct-vllm': 'Qwen-2.5-7B',
'qwen2.5-14b-instruct-vllm': 'Qwen-2.5-14B',
'qwen2.5-32b-instruct-vllm': 'Qwen-2.5-32B',
'qwen2_5-72b-instruct-vllm': 'Qwen-2.5-72B',
'gemma-3-4b-it-vllm': 'Gemma-3-4B',
'gemma-3-12b-it-vllm': 'Gemma-3-12B',
'gemma-3-27b-it-vllm': 'Gemma-3-27B',
'glm-4-9b-chat-vllm': 'GLM4-9B-Chat',
'llama-3.1-8b-instruct-vllm': 'LLaMA-3.1-8B',
'llama-3.1-70b-instruct-vllm': 'LLaMA-3.1-70B',
# Add more mappings as necessary
}
dataset_mapping_dict = {}
needle_counts = ['2', '3', '4', '5']
@ -103,10 +116,10 @@ def calculate_elementwise_average(model_name, merged_df):
multi_columns = [col for col in score_columns if 'needle' in col]
if origin_columns and parallel_columns and multi_columns:
origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
merged_df[model_name] = origin_avg + parallel_avg + multi_avg
origin_avg = merged_df[origin_columns].mean(axis=1)
parallel_avg = merged_df[parallel_columns].mean(axis=1)
multi_avg = merged_df[multi_columns].mean(axis=1)
merged_df[model_name] = (origin_avg + parallel_avg + multi_avg) / 3
else:
relevant_columns = origin_columns or parallel_columns or multi_columns
if relevant_columns:
@ -217,7 +230,7 @@ def save_results_to_plots(txt_results_save_path):
folder_path = os.path.join(plot_path, dataset_mapping_dict[dataset_abbr])
ensure_directory(folder_path)
save_path = os.path.join(folder_path, f'{model_name}.png')
save_path = os.path.join(folder_path, f'{model_name}.pdf')
df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag)
@ -226,25 +239,25 @@ def save_results_to_plots(txt_results_save_path):
model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)
overall_dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + parallel_dataset_abbrs
overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png')
overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.pdf')
merged_df = merge_dataframes(model_name, overall_dataset_abbrs, parsed_data)
averaged_df = calculate_elementwise_average(model_name, merged_df)
overall_score = visualize(averaged_df, overall_score_pic_path, model_name, 'Overall Score')
# Single-Retrieval
single_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_single_retrieval_overall.png')
single_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_single_retrieval_overall.pdf')
single_retrieval_merged_df = merge_dataframes(model_name, origin_dataset_abbrs, parsed_data)
single_retrieval_averaged_df = calculate_elementwise_average(model_name, single_retrieval_merged_df)
single_retrieval_overall_score = visualize(single_retrieval_averaged_df, single_retrieval_score_pic_path, model_name, 'Single-Retrieval Overall Score')
# Multi-Retrieval
multi_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_retrieval_overall.png')
multi_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_retrieval_overall.pdf')
multi_retrieval_merged_df = merge_dataframes(model_name, parallel_dataset_abbrs, parsed_data)
multi_retrieval_averaged_df = calculate_elementwise_average(model_name, multi_retrieval_merged_df)
multi_retrieval_overall_score = visualize(multi_retrieval_averaged_df, multi_retrieval_score_pic_path, model_name, 'Multi-Retrieval Overall Score')
# Multi-Reasoning
multi_reasoning_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_reasoning_overall.png')
multi_reasoning_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_reasoning_overall.pdf')
multi_reasoning_merged_df = merge_dataframes(model_name, multi_dataset_abbrs, parsed_data)
multi_reasoning_averaged_df = calculate_elementwise_average(model_name, multi_reasoning_merged_df)
multi_reasoning_overall_score = visualize(multi_reasoning_averaged_df, multi_reasoning_score_pic_path, model_name, 'Multi-Reasoning Overall Score')
@ -279,7 +292,7 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
mean_scores = pivot_table.mean().values
overall_score = mean_scores.mean()
plt.figure(figsize=(10, 6))
plt.figure(figsize=(7.5, 4.5))
ax = plt.gca()
cmap = LinearSegmentedColormap.from_list(
'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
@ -353,11 +366,11 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
directory_path, original_filename = os.path.split(save_path)
filename_suffix = (title_name+'_'+dataset_name).replace(' ', '_')
new_filename = f'{filename_suffix}.png'
new_filename = f'{filename_suffix}.pdf'
new_save_path = os.path.join(directory_path, new_filename)
plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
plt.savefig(new_save_path, format='pdf', bbox_inches='tight', pad_inches=0)
print(f'Saved: {new_save_path}')
plt.close()