mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

* Squashed commit of the following: commit c48ad194c3976dc63d1b60d8c8ab2d5ff9e1cbfe Author: DseidLi <2568818204@qq.com> Date: Tue Apr 2 16:57:43 2024 +0800 add atc_choice commit 3ac6efea29619573e6fac8fa3cce464853dcead0 Merge:2d4e559
8e3a9c3 Author: DseidLi <2568818204@qq.com> Date: Tue Apr 2 16:41:38 2024 +0800 Merge branch 'atc_choice' into atc_add_choice commit 8e3a9c396a3e5546d3faf584183f6fd60b974d5e Merge: 150a0360a6a03f
Author: DseidLi <2568818204@qq.com> Date: Tue Mar 26 04:47:07 2024 +0800 Merge branch 'main' into atc_choice Conflicts: configs/summarizers/needlebench.py opencompass/datasets/needlebench/multi.py opencompass/datasets/needlebench/origin.py opencompass/datasets/needlebench/parallel.py commit 150a036d6d990f26a57c974d1af83d88c31a0f9d Merge: 8d6ac9a 940dd18 Author: DseidLi <2568818204@qq.com> Date: Wed Mar 20 03:49:08 2024 +0800 Merge branch 'needlebench_fix' into atc_choice commit 8d6ac9a1a43b1c9d0f0ea27e7d58968a203ea898 Author: DseidLi <2568818204@qq.com> Date: Wed Mar 20 03:41:49 2024 +0800 optimize needlebench code commit 940dd18a4270f24bc69edd2a780182c68918e1a9 Author: DseidLi <2568818204@qq.com> Date: Wed Mar 20 03:39:46 2024 +0800 fix vllm commit d8be6877bc41051f3edcc0421c462c834c0f1c9a Merge: ecad78a2527fda
Author: DseidLi <2568818204@qq.com> Date: Tue Mar 19 21:07:08 2024 +0800 Merge remote-tracking branch 'origin/add_1M_dataset' into atc_choice commit2527fda8a5
Author: DseidLi <2568818204@qq.com> Date: Tue Mar 19 16:03:40 2024 +0800 add model configs commit75425acdf8
Author: DseidLi <2568818204@qq.com> Date: Tue Mar 19 16:02:15 2024 +0800 add prompt postion args commit367ba1ba61
Author: DseidLi <2568818204@qq.com> Date: Wed Feb 28 21:40:00 2024 +0800 add Needlebench-1000K configs commit ecad78af14c4bb00fe325779114b384c57ab30bf Author: DseidLi <2568818204@qq.com> Date: Thu Mar 14 22:08:32 2024 +0800 fix atc commit 08772c0787b18872abadc9ffec3223941a5ee0c2 Merge: 9f3f8cfcaf1cf8
Author: DseidLi <2568818204@qq.com> Date: Thu Mar 14 22:07:28 2024 +0800 Merge branch 'main' into atc_choice Conflicts: configs/datasets/needlebench/readme.md configs/datasets/needlebench/readme_zh-CN.md configs/summarizers/needlebench.py opencompass/datasets/needlebench/atc.py opencompass/summarizers/needlebench.py commit 9f3f8cfb4452722734d334114ac1d14110e57406 Author: DseidLi <2568818204@qq.com> Date: Thu Mar 14 21:35:53 2024 +0800 add atc-choice test commit 52be7c1202376b4e09821188b826f1a805328129 Author: DseidLi <2568818204@qq.com> Date: Wed Mar 6 02:54:15 2024 +0800 update needlebench randomseed and add vllm qwen14b commit fc1effce596ae2e5ece4933e8cd34aef8e64a6f9 Merge: 4e747edcaf1cf8
Author: DseidLi <2568818204@qq.com> Date: Wed Mar 6 02:51:14 2024 +0800 Merge branch 'main' into add_model_configs commit 31834f9b23af3354ac3581ec86d693d0f05cdd1c Merge: 7dabc82120bf8b
Author: DseidLi <2568818204@qq.com> Date: Sun Mar 3 23:29:42 2024 +0800 Merge branch 'main' of https://github.com/open-compass/opencompass into atc_choice commit 4e747ed1988ddbcfcc7fff334601259ade72d363 Author: DseidLi <2568818204@qq.com> Date: Sun Mar 3 22:15:25 2024 +0800 add internlm2-lmdeploy model and gemma configs commit 7dabc828123d711c8cf834d6aab4137bb55e85ed Author: DseidLi <2568818204@qq.com> Date: Sat Mar 2 17:26:15 2024 +0800 add atc choice version -ZH commit996f8ae43d
Author: DseidLi <2568818204@qq.com> Date: Wed Feb 28 16:58:56 2024 +0800 update readme for needlebench commitf7266e873c
Author: DseidLi <2568818204@qq.com> Date: Wed Feb 28 16:44:53 2024 +0800 move readme.md commit1c7375681d
Author: DseidLi <2568818204@qq.com> Date: Wed Feb 28 16:38:31 2024 +0800 fix linting error commitb6524f3ebf
Author: DseidLi <2568818204@qq.com> Date: Wed Feb 28 16:33:51 2024 +0800 lint summarizer commitc0d1190e39
Author: DseidLi <2568818204@qq.com> Date: Wed Feb 28 16:29:03 2024 +0800 add needlebench intro, fix summarizer commit0965baf785
Author: DseidLi <2568818204@qq.com> Date: Mon Feb 26 13:31:26 2024 +0800 fix bug in needlebench summarizer commit5d32b31eb8
Author: DseidLi <2568818204@qq.com> Date: Sat Feb 24 03:19:08 2024 +0800 update act prompt commit af82a7f085 Merge: 32bf9fe 53fe788
Author: DseidLi <2568818204@qq.com> Date: Fri Feb 23 17:50:32 2024 +0800 Merge remote-tracking branch 'upstream/main' into needlebench commit 32bf9fe802
Author: DseidLi <2568818204@qq.com> Date: Fri Feb 23 17:31:32 2024 +0800 simplify needlebench 32k, 128k, 200k for eval commit a7cb025e05
Author: DseidLi <2568818204@qq.com> Date: Fri Feb 23 14:48:58 2024 +0800 add needlebench * fix summarizer * remove repeated code * remove chinese comments
818 lines
43 KiB
Python
818 lines
43 KiB
Python
from opencompass.summarizers.needlebench import NeedleBenchSummarizer
|
|
from opencompass.summarizers.needlebench import NeedleBenchATCSummarizer
|
|
|
|
# ----------NeedleBench-4k-summarizer----------
# Values substituted into the `Length...` part of the 4k dataset abbreviations.
context_lengths_4k = list(range(1000, 5000, 1000))
# Values substituted into the `Depth...` part of the abbreviations.
# `depths` is used by the 4k and 8k sections, `depths_list_sparse` by the
# larger ones.
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

# One list of dataset abbreviations per task variant and language.
_needlebench_4k_2needle_en = []
_needlebench_4k_3needle_en = []
_needlebench_4k_4needle_en = []
_needlebench_4k_5needle_en = []
_needlebench_4k_2needle_zh = []
_needlebench_4k_3needle_zh = []
_needlebench_4k_4needle_zh = []
_needlebench_4k_5needle_zh = []
_needlebench_4k_origin_en = []
_needlebench_4k_origin_zh = []

# Each (context length, depth) pair adds exactly one abbreviation to every
# multi-needle and single-needle ("origin") list.
for original_context_length in context_lengths_4k:
    for depth_percent in depths:
        _needlebench_4k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_4k')
        _needlebench_4k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_4k')
        _needlebench_4k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_4k')
        _needlebench_4k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_4k')
        _needlebench_4k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_4k')
        _needlebench_4k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_4k')
        _needlebench_4k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_4k')
        _needlebench_4k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_4k')

        _needlebench_4k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_4k')
        _needlebench_4k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_4k')

# Aggregate lists: per language first, then both languages together.
_needlebench_4k_multi_needle_en = (
    _needlebench_4k_2needle_en + _needlebench_4k_3needle_en
    + _needlebench_4k_4needle_en + _needlebench_4k_5needle_en
)
_needlebench_4k_multi_needle_zh = (
    _needlebench_4k_2needle_zh + _needlebench_4k_3needle_zh
    + _needlebench_4k_4needle_zh + _needlebench_4k_5needle_zh
)
_needlebench_4k_origin = _needlebench_4k_origin_en + _needlebench_4k_origin_zh
_needlebench_4k_multi_needle = _needlebench_4k_multi_needle_en + _needlebench_4k_multi_needle_zh

# Parallel abbreviations carry no depth component: one entry per context
# length and language.
_needlebench_4k_parallel_en = []
_needlebench_4k_parallel_zh = []
for original_context_length in context_lengths_4k:
    _needlebench_4k_parallel_en.append(f'Length{original_context_length}_parallel_en_4k')
    _needlebench_4k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_4k')
_needlebench_4k_parallel = _needlebench_4k_parallel_en + _needlebench_4k_parallel_zh
|
|
|
|
# Summary groups for the 4k setting: each group name maps to the dataset
# abbreviations (or, for 'overall', to other groups) it is built from.
needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_4k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_4k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_4k_origin_en},

    {'name': 'multi_needle_en', 'subsets': _needlebench_4k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_4k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_4k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_4k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_4k_5needle_en},

    {'name': 'multi_needle_zh', 'subsets': _needlebench_4k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_4k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_4k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_4k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_4k_5needle_zh},

    {'name': 'multi_needle', 'subsets': _needlebench_4k_multi_needle},

    {'name': 'parallel_version', 'subsets': _needlebench_4k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_4k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_4k_parallel_en},

    # 'overall' combines the three top-level groups with weights 0.4/0.3/0.3.
    # Each subset entry pairs a group name with a second string, presumably
    # the metric to read from that group -- confirm against the summarizer.
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

# Summarizer config for the 4k setting; `dataset_abbrs` lists the group names
# together with '---------' category header strings.
needlebench_4k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-4k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-4k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-4k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',

        # Uncomment to also list every individual dataset abbreviation:
        # *_needlebench_4k_origin, *_needlebench_4k_multi_needle, *_needlebench_4k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# ----------NeedleBench-8k-summarizer----------

# Values substituted into the `Length...` part of the 8k dataset abbreviations.
context_lengths_8k = list(range(5000, 9000, 1000))

# One list of dataset abbreviations per task variant and language.
_needlebench_8k_2needle_en = []
_needlebench_8k_3needle_en = []
_needlebench_8k_4needle_en = []
_needlebench_8k_5needle_en = []
_needlebench_8k_2needle_zh = []
_needlebench_8k_3needle_zh = []
_needlebench_8k_4needle_zh = []
_needlebench_8k_5needle_zh = []
_needlebench_8k_origin_en = []
_needlebench_8k_origin_zh = []

# Each (context length, depth) pair adds exactly one abbreviation to every
# multi-needle and single-needle ("origin") list.
for original_context_length in context_lengths_8k:
    for depth_percent in depths:
        _needlebench_8k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_8k')
        _needlebench_8k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_8k')
        _needlebench_8k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_8k')
        _needlebench_8k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_8k')
        _needlebench_8k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_8k')
        _needlebench_8k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_8k')
        _needlebench_8k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_8k')
        _needlebench_8k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_8k')

        _needlebench_8k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_8k')
        _needlebench_8k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_8k')

# Aggregate lists: per language first, then both languages together.
_needlebench_8k_multi_needle_en = (
    _needlebench_8k_2needle_en + _needlebench_8k_3needle_en
    + _needlebench_8k_4needle_en + _needlebench_8k_5needle_en
)
_needlebench_8k_multi_needle_zh = (
    _needlebench_8k_2needle_zh + _needlebench_8k_3needle_zh
    + _needlebench_8k_4needle_zh + _needlebench_8k_5needle_zh
)
_needlebench_8k_origin = _needlebench_8k_origin_en + _needlebench_8k_origin_zh
_needlebench_8k_multi_needle = _needlebench_8k_multi_needle_en + _needlebench_8k_multi_needle_zh

# Parallel abbreviations carry no depth component: one entry per context
# length and language.
_needlebench_8k_parallel_en = []
_needlebench_8k_parallel_zh = []
for original_context_length in context_lengths_8k:
    _needlebench_8k_parallel_en.append(f'Length{original_context_length}_parallel_en_8k')
    _needlebench_8k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_8k')
_needlebench_8k_parallel = _needlebench_8k_parallel_en + _needlebench_8k_parallel_zh
|
|
|
|
# Summary groups for the 8k setting: each group name maps to the dataset
# abbreviations (or, for 'overall', to other groups) it is built from.
needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_8k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_8k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_8k_origin_en},

    {'name': 'multi_needle_en', 'subsets': _needlebench_8k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_8k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_8k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_8k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_8k_5needle_en},

    {'name': 'multi_needle_zh', 'subsets': _needlebench_8k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_8k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_8k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_8k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_8k_5needle_zh},

    {'name': 'multi_needle', 'subsets': _needlebench_8k_multi_needle},

    {'name': 'parallel_version', 'subsets': _needlebench_8k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_8k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_8k_parallel_en},

    # 'overall' combines the three top-level groups with weights 0.4/0.3/0.3.
    # Each subset entry pairs a group name with a second string, presumably
    # the metric to read from that group -- confirm against the summarizer.
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

# Summarizer config for the 8k setting; `dataset_abbrs` lists the group names
# together with '---------' category header strings.
needlebench_8k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-8k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-8k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-8k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',

        # Uncomment to also list every individual dataset abbreviation:
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# ----------NeedleBench-32k-summarizer----------

# Values substituted into the `Length...` part of the 32k dataset abbreviations.
context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]

# One list of dataset abbreviations per task variant and language.
_needlebench_32k_2needle_en = []
_needlebench_32k_3needle_en = []
_needlebench_32k_4needle_en = []
_needlebench_32k_5needle_en = []
_needlebench_32k_2needle_zh = []
_needlebench_32k_3needle_zh = []
_needlebench_32k_4needle_zh = []
_needlebench_32k_5needle_zh = []
_needlebench_32k_origin_en = []
_needlebench_32k_origin_zh = []

# Each (context length, depth) pair adds exactly one abbreviation to every
# multi-needle and single-needle ("origin") list. This section iterates the
# sparse depth list.
for original_context_length in context_lengths_32k:
    for depth_percent in depths_list_sparse:
        _needlebench_32k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_32k')
        _needlebench_32k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_32k')
        _needlebench_32k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_32k')
        _needlebench_32k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_32k')
        _needlebench_32k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_32k')
        _needlebench_32k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_32k')
        _needlebench_32k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_32k')
        _needlebench_32k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_32k')

        _needlebench_32k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_32k')
        _needlebench_32k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_32k')

# Aggregate lists: per language first, then both languages together.
_needlebench_32k_multi_needle_en = (
    _needlebench_32k_2needle_en + _needlebench_32k_3needle_en
    + _needlebench_32k_4needle_en + _needlebench_32k_5needle_en
)
_needlebench_32k_multi_needle_zh = (
    _needlebench_32k_2needle_zh + _needlebench_32k_3needle_zh
    + _needlebench_32k_4needle_zh + _needlebench_32k_5needle_zh
)
_needlebench_32k_origin = _needlebench_32k_origin_en + _needlebench_32k_origin_zh
_needlebench_32k_multi_needle = _needlebench_32k_multi_needle_en + _needlebench_32k_multi_needle_zh

# Parallel abbreviations carry no depth component: one entry per context
# length and language.
_needlebench_32k_parallel_en = []
_needlebench_32k_parallel_zh = []
for original_context_length in context_lengths_32k:
    _needlebench_32k_parallel_en.append(f'Length{original_context_length}_parallel_en_32k')
    _needlebench_32k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_32k')
_needlebench_32k_parallel = _needlebench_32k_parallel_en + _needlebench_32k_parallel_zh
|
|
|
|
# Summary groups for the 32k setting: each group name maps to the dataset
# abbreviations (or, for 'overall', to other groups) it is built from.
needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_32k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_32k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_32k_origin_en},

    {'name': 'multi_needle_en', 'subsets': _needlebench_32k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_32k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_32k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_32k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_32k_5needle_en},

    {'name': 'multi_needle_zh', 'subsets': _needlebench_32k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_32k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_32k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_32k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_32k_5needle_zh},

    {'name': 'multi_needle', 'subsets': _needlebench_32k_multi_needle},

    {'name': 'parallel_version', 'subsets': _needlebench_32k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_32k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_32k_parallel_en},

    # 'overall' combines the three top-level groups with weights 0.4/0.3/0.3.
    # Each subset entry pairs a group name with a second string, presumably
    # the metric to read from that group -- confirm against the summarizer.
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

# Summarizer config for the 32k setting; `dataset_abbrs` lists the group names
# together with '---------' category header strings.
needlebench_32k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-32k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-32k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-32k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',

        # Uncomment to also list every individual dataset abbreviation:
        # *_needlebench_32k_origin, *_needlebench_32k_multi_needle, *_needlebench_32k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# ----------NeedleBench-128k-summarizer----------

# Values substituted into the `Length...` part of the 128k dataset
# abbreviations.
context_lengths_128k = [16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]

# One list of dataset abbreviations per task variant and language.
_needlebench_128k_2needle_en = []
_needlebench_128k_3needle_en = []
_needlebench_128k_4needle_en = []
_needlebench_128k_5needle_en = []
_needlebench_128k_2needle_zh = []
_needlebench_128k_3needle_zh = []
_needlebench_128k_4needle_zh = []
_needlebench_128k_5needle_zh = []
_needlebench_128k_origin_en = []
_needlebench_128k_origin_zh = []

# Each (context length, depth) pair adds exactly one abbreviation to every
# multi-needle and single-needle ("origin") list. This section iterates the
# sparse depth list.
for original_context_length in context_lengths_128k:
    for depth_percent in depths_list_sparse:
        _needlebench_128k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_128k')
        _needlebench_128k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_128k')
        _needlebench_128k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_128k')
        _needlebench_128k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_128k')
        _needlebench_128k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_128k')
        _needlebench_128k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_128k')
        _needlebench_128k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_128k')
        _needlebench_128k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_128k')

        _needlebench_128k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_128k')
        _needlebench_128k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_128k')

# Aggregate lists: per language first, then both languages together.
_needlebench_128k_multi_needle_en = (
    _needlebench_128k_2needle_en + _needlebench_128k_3needle_en
    + _needlebench_128k_4needle_en + _needlebench_128k_5needle_en
)
_needlebench_128k_multi_needle_zh = (
    _needlebench_128k_2needle_zh + _needlebench_128k_3needle_zh
    + _needlebench_128k_4needle_zh + _needlebench_128k_5needle_zh
)
_needlebench_128k_origin = _needlebench_128k_origin_en + _needlebench_128k_origin_zh
_needlebench_128k_multi_needle = _needlebench_128k_multi_needle_en + _needlebench_128k_multi_needle_zh

# Parallel abbreviations carry no depth component: one entry per context
# length and language.
_needlebench_128k_parallel_en = []
_needlebench_128k_parallel_zh = []
for original_context_length in context_lengths_128k:
    _needlebench_128k_parallel_en.append(f'Length{original_context_length}_parallel_en_128k')
    _needlebench_128k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_128k')
_needlebench_128k_parallel = _needlebench_128k_parallel_en + _needlebench_128k_parallel_zh
|
|
|
|
# Summary groups for the 128k setting: each group name maps to the dataset
# abbreviations (or, for 'overall', to other groups) it is built from.
needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_128k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_128k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_128k_origin_en},

    {'name': 'multi_needle_en', 'subsets': _needlebench_128k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_128k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_128k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_128k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_128k_5needle_en},

    {'name': 'multi_needle_zh', 'subsets': _needlebench_128k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_128k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_128k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_128k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_128k_5needle_zh},

    {'name': 'multi_needle', 'subsets': _needlebench_128k_multi_needle},

    {'name': 'parallel_version', 'subsets': _needlebench_128k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_128k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_128k_parallel_en},

    # 'overall' combines the three top-level groups with weights 0.4/0.3/0.3.
    # Each subset entry pairs a group name with a second string, presumably
    # the metric to read from that group -- confirm against the summarizer.
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

# Summarizer config for the 128k setting; `dataset_abbrs` lists the group
# names together with '---------' category header strings.
needlebench_128k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-128k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-128k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-128k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',

        # Uncomment to also list every individual dataset abbreviation:
        # *_needlebench_128k_origin, *_needlebench_128k_multi_needle, *_needlebench_128k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# ----------NeedleBench-200k-summarizer----------

# Values substituted into the `Length...` part of the 200k dataset
# abbreviations.
context_lengths_200k = [16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]

# One list of dataset abbreviations per task variant and language.
_needlebench_200k_2needle_en = []
_needlebench_200k_3needle_en = []
_needlebench_200k_4needle_en = []
_needlebench_200k_5needle_en = []
_needlebench_200k_2needle_zh = []
_needlebench_200k_3needle_zh = []
_needlebench_200k_4needle_zh = []
_needlebench_200k_5needle_zh = []
_needlebench_200k_origin_en = []
_needlebench_200k_origin_zh = []

# Each (context length, depth) pair adds exactly one abbreviation to every
# multi-needle and single-needle ("origin") list. This section iterates the
# sparse depth list.
for original_context_length in context_lengths_200k:
    for depth_percent in depths_list_sparse:
        _needlebench_200k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_200k')
        _needlebench_200k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_200k')
        _needlebench_200k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_200k')
        _needlebench_200k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_200k')
        _needlebench_200k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_200k')
        _needlebench_200k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_200k')
        _needlebench_200k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_200k')
        _needlebench_200k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_200k')

        _needlebench_200k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_200k')
        _needlebench_200k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_200k')

# Aggregate lists: per language first, then both languages together.
_needlebench_200k_multi_needle_en = (
    _needlebench_200k_2needle_en + _needlebench_200k_3needle_en
    + _needlebench_200k_4needle_en + _needlebench_200k_5needle_en
)
_needlebench_200k_multi_needle_zh = (
    _needlebench_200k_2needle_zh + _needlebench_200k_3needle_zh
    + _needlebench_200k_4needle_zh + _needlebench_200k_5needle_zh
)
_needlebench_200k_origin = _needlebench_200k_origin_en + _needlebench_200k_origin_zh
_needlebench_200k_multi_needle = _needlebench_200k_multi_needle_en + _needlebench_200k_multi_needle_zh

# Parallel abbreviations carry no depth component: one entry per context
# length and language.
_needlebench_200k_parallel_en = []
_needlebench_200k_parallel_zh = []
for original_context_length in context_lengths_200k:
    _needlebench_200k_parallel_en.append(f'Length{original_context_length}_parallel_en_200k')
    _needlebench_200k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_200k')
_needlebench_200k_parallel = _needlebench_200k_parallel_en + _needlebench_200k_parallel_zh
|
|
|
|
# Summary groups for the 200k setting: each group name maps to the dataset
# abbreviations (or, for 'overall', to other groups) it is built from.
needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_200k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_200k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_200k_origin_en},

    {'name': 'multi_needle_en', 'subsets': _needlebench_200k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_200k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_200k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_200k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_200k_5needle_en},

    {'name': 'multi_needle_zh', 'subsets': _needlebench_200k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_200k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_200k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_200k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_200k_5needle_zh},

    {'name': 'multi_needle', 'subsets': _needlebench_200k_multi_needle},

    {'name': 'parallel_version', 'subsets': _needlebench_200k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_200k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_200k_parallel_en},

    # 'overall' combines the three top-level groups with weights 0.4/0.3/0.3.
    # Each subset entry pairs a group name with a second string, presumably
    # the metric to read from that group -- confirm against the summarizer.
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

# Summarizer config for the 200k setting; `dataset_abbrs` lists the group
# names together with '---------' category header strings.
needlebench_200k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-200k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-200k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-200k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',

        # Uncomment to also list every individual dataset abbreviation:
        # *_needlebench_200k_origin, *_needlebench_200k_multi_needle, *_needlebench_200k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# ----------NeedleBench-1000k-summarizer----------

context_lengths_1000k = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]

# Dataset abbreviations, one list per (needle count, language) combination,
# plus the single-needle ("origin") lists.
_needlebench_1000k_2needle_en = []
_needlebench_1000k_3needle_en = []
_needlebench_1000k_4needle_en = []
_needlebench_1000k_5needle_en = []
_needlebench_1000k_2needle_zh = []
_needlebench_1000k_3needle_zh = []
_needlebench_1000k_4needle_zh = []
_needlebench_1000k_5needle_zh = []
_needlebench_1000k_origin_en = []
_needlebench_1000k_origin_zh = []

# One abbreviation per (context length, depth) pair; context length varies
# slowest. Depth is truncated to an int so it formats without a decimal part.
for original_context_length in context_lengths_1000k:
    for depth_percent in depths_list_sparse:
        _needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k')
        _needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k')
        _needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k')
        _needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k')
        _needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k')
        _needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k')
        _needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k')
        _needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k')

        _needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k')
        _needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k')

# Concatenate the multi-needle and origin lists (English entries first).
_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en
_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh
_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh
_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh

# Parallel-needle abbreviations carry only the context length (no depth part).
_needlebench_1000k_parallel_en = []
_needlebench_1000k_parallel_zh = []
for original_context_length in context_lengths_1000k:
    _needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k')
    _needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k')
_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh
|
|
|
|
# Summary groups for the NeedleBench-1000k run. Each group names an aggregate
# row and lists the dataset abbreviations (or other groups) it is built from.
needlebench_summary_groups = [
    # Single-needle ("origin") datasets.
    {'name': 'original_version', 'subsets': _needlebench_1000k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en},

    # Multi-needle datasets, English.
    {'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_1000k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en},

    # Multi-needle datasets, Chinese.
    {'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh},

    {'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle},

    # Parallel-needle datasets.
    {'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_1000k_parallel_en},

    # Overall score: combines the three groups above, each paired with the
    # metric to read from it, using weights 0.4 / 0.3 / 0.3 (sum 1.0).
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]
|
|
# Report layout for the NeedleBench-1000k run: the 'overall' score first,
# followed by the single-needle, parallel-needle and multi-needle groups.
# Plain strings that are not group names act as section headers in the table.
# `summary_groups` holds the list bound to `needlebench_summary_groups` at
# this point; the name is rebound further down for the 8k batch summarizers.
needlebench_1000k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-1000k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-1000k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-1000k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # Per-dataset rows (disabled):
        # *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# Context lengths covered by the 8k batch experiments: 5000, 6000, 7000, 8000.
context_lengths_8k = list(range(5000, 9000, 1000))

# Parallel-needle dataset abbreviations for the 8k batch experiments, one list
# per (language, batch size) combination. Names carry only the context length
# and the batch suffix (no depth part).
_needlebench_8k_parallel_en_batch1 = []
_needlebench_8k_parallel_en_batch5 = []
_needlebench_8k_parallel_en_batch10 = []
_needlebench_8k_parallel_en_batch15 = []
_needlebench_8k_parallel_en_batch20 = []
_needlebench_8k_parallel_zh_batch1 = []
_needlebench_8k_parallel_zh_batch5 = []
_needlebench_8k_parallel_zh_batch10 = []
_needlebench_8k_parallel_zh_batch15 = []
_needlebench_8k_parallel_zh_batch20 = []

for original_context_length in context_lengths_8k:
    _needlebench_8k_parallel_en_batch1.append(f'Length{original_context_length}_parallel_en_8k_batch1')
    _needlebench_8k_parallel_en_batch5.append(f'Length{original_context_length}_parallel_en_8k_batch5')
    _needlebench_8k_parallel_en_batch10.append(f'Length{original_context_length}_parallel_en_8k_batch10')
    _needlebench_8k_parallel_en_batch15.append(f'Length{original_context_length}_parallel_en_8k_batch15')
    _needlebench_8k_parallel_en_batch20.append(f'Length{original_context_length}_parallel_en_8k_batch20')
    _needlebench_8k_parallel_zh_batch1.append(f'Length{original_context_length}_parallel_zh_8k_batch1')
    _needlebench_8k_parallel_zh_batch5.append(f'Length{original_context_length}_parallel_zh_8k_batch5')
    _needlebench_8k_parallel_zh_batch10.append(f'Length{original_context_length}_parallel_zh_8k_batch10')
    _needlebench_8k_parallel_zh_batch15.append(f'Length{original_context_length}_parallel_zh_8k_batch15')
    _needlebench_8k_parallel_zh_batch20.append(f'Length{original_context_length}_parallel_zh_8k_batch20')

# Combined (English first, then Chinese) list for each batch size.
_needlebench_8k_parallel_batch1 = _needlebench_8k_parallel_en_batch1 + _needlebench_8k_parallel_zh_batch1
_needlebench_8k_parallel_batch5 = _needlebench_8k_parallel_en_batch5 + _needlebench_8k_parallel_zh_batch5
_needlebench_8k_parallel_batch10 = _needlebench_8k_parallel_en_batch10 + _needlebench_8k_parallel_zh_batch10
_needlebench_8k_parallel_batch15 = _needlebench_8k_parallel_en_batch15 + _needlebench_8k_parallel_zh_batch15
_needlebench_8k_parallel_batch20 = _needlebench_8k_parallel_en_batch20 + _needlebench_8k_parallel_zh_batch20
|
|
|
|
# Summary groups for the 8k batch comparison, aggregating the 'average_score'
# metric of every dataset in each (language, batch size) list.
# Rebinds `needlebench_summary_groups`; summarizers defined earlier keep the
# list object they were given.
needlebench_summary_groups = [
    {'name': 'parallel_version_batch1', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch1]},
    {'name': 'parallel_version_zh_batch1', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch1]},
    {'name': 'parallel_version_en_batch1', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch1]},
    {'name': 'parallel_version_batch5', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch5]},
    {'name': 'parallel_version_zh_batch5', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch5]},
    {'name': 'parallel_version_en_batch5', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch5]},
    {'name': 'parallel_version_batch10', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch10]},
    {'name': 'parallel_version_zh_batch10', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch10]},
    {'name': 'parallel_version_en_batch10', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch10]},
    {'name': 'parallel_version_batch15', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch15]},
    {'name': 'parallel_version_zh_batch15', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch15]},
    {'name': 'parallel_version_en_batch15', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch15]},
    {'name': 'parallel_version_batch20', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_batch20]},
    {'name': 'parallel_version_zh_batch20', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_zh_batch20]},
    {'name': 'parallel_version_en_batch20', 'subsets': [[_dataset, 'average_score'] for _dataset in _needlebench_8k_parallel_en_batch20]},
]
|
|
|
|
# Report layout for the 8k batch comparison: the combined rows per batch size
# first, then the per-language rows. No `type` is set on this summarizer.
# `summary_groups` holds the list bound to `needlebench_summary_groups` at
# this point; the name is rebound again further down.
needlebench_8k_batch_overall_summarizer = dict(
    dataset_abbrs=[
        '--------- NeedleBench-8k Parallel-Needles ---------',  # category
        'parallel_version_batch1',
        'parallel_version_batch5',
        'parallel_version_batch10',
        'parallel_version_batch15',
        'parallel_version_batch20',
        'parallel_version_zh_batch1',
        'parallel_version_en_batch1',
        'parallel_version_zh_batch5',
        'parallel_version_en_batch5',
        'parallel_version_zh_batch10',
        'parallel_version_en_batch10',
        'parallel_version_zh_batch15',
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # Per-dataset rows (disabled):
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# Summary groups for the 8k batch comparison, aggregating the 'Depth0' metric
# of every dataset in each (language, batch size) list. Group names are the
# same as in the 'average_score' variant; only the metric differs.
# Rebinds `needlebench_summary_groups`; summarizers defined earlier keep the
# list object they were given.
needlebench_summary_groups = [
    {'name': 'parallel_version_batch1', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch1]},
    {'name': 'parallel_version_zh_batch1', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch1]},
    {'name': 'parallel_version_en_batch1', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch1]},
    {'name': 'parallel_version_batch5', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch5]},
    {'name': 'parallel_version_zh_batch5', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch5]},
    {'name': 'parallel_version_en_batch5', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch5]},
    {'name': 'parallel_version_batch10', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch10]},
    {'name': 'parallel_version_zh_batch10', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch10]},
    {'name': 'parallel_version_en_batch10', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch10]},
    {'name': 'parallel_version_batch15', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch15]},
    {'name': 'parallel_version_zh_batch15', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch15]},
    {'name': 'parallel_version_en_batch15', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch15]},
    {'name': 'parallel_version_batch20', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_batch20]},
    {'name': 'parallel_version_zh_batch20', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_zh_batch20]},
    {'name': 'parallel_version_en_batch20', 'subsets': [[_dataset, 'Depth0'] for _dataset in _needlebench_8k_parallel_en_batch20]},
]
|
|
|
|
# Report layout for the 8k batch comparison restricted to the 'Depth0' metric:
# the combined rows per batch size first, then the per-language rows.
# No `type` is set on this summarizer. `summary_groups` holds the list bound
# to `needlebench_summary_groups` at this point.
needlebench_8k_batch_depth0_summarizer = dict(
    dataset_abbrs=[
        '--------- NeedleBench-8k Parallel-Needles ---------',  # category
        'parallel_version_batch1',
        'parallel_version_batch5',
        'parallel_version_batch10',
        'parallel_version_batch15',
        'parallel_version_batch20',
        'parallel_version_zh_batch1',
        'parallel_version_en_batch1',
        'parallel_version_zh_batch5',
        'parallel_version_en_batch5',
        'parallel_version_zh_batch10',
        'parallel_version_en_batch10',
        'parallel_version_zh_batch15',
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # Per-dataset rows (disabled):
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
|
|
|
|
# Needle counts covered by the ATC datasets: 2 through 19 inclusive.
needle_num_list = list(range(2, 20))

# Report categories. The '-CircularEval' / '-Circular' suffix selects the
# circular-evaluation metric on the same underlying datasets as the
# un-suffixed category.
categories = ['ZH', 'EN', 'ZH-Reasoning', 'EN-Reasoning', 'ZH-CircularEval', 'EN-CircularEval', 'ZH-Reasoning-Circular', 'EN-Reasoning-Circular']
needlebench_atc_summary_groups = []

for category in categories:
    # Match on 'Circular' rather than 'CircularEval': the '*-Reasoning-Circular'
    # categories do not contain 'CircularEval' and would otherwise fall back to
    # 'acc_1', making them exact duplicates of the '*-Reasoning' groups.
    metric = 'perf_4' if 'Circular' in category else 'acc_1'
    # Strip the circular suffix to recover the dataset-name suffix.
    cleaned_category = category.replace('-CircularEval', '').replace('-Circular', '')
    subsets = [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}'
               for num_needles in needle_num_list]

    needlebench_atc_summary_groups.append({
        'name': category,
        # Each subset is a [dataset abbreviation, metric] pair.
        'subsets': [[subset, metric] for subset in subsets],
    })
|
|
|
|
# Header rows for the ATC report: for every category, a title line followed by
# a [group name, metric] entry requesting that group's 'weighted_average'.
# NOTE(review): the ATC summary groups built in this file carry no 'weights'
# key - confirm the summarizer still reports a 'weighted_average' for them.
atc_dataset_abbrs = []

for category in categories:
    title = f'######## Needlebench-ATC-{category}-Score ########'
    atc_dataset_abbrs.append(title)

    weighted_average_score_entry = [category, 'weighted_average']
    atc_dataset_abbrs.append(weighted_average_score_entry)

# The former "pop a trailing '------' separator" check was removed: the last
# element appended above is always a [category, metric] list, so the check
# could never match, and it raised IndexError when the list was empty.
|
|
|
|
# Report layout for NeedleBench-ATC: the per-category score rows first, then
# one [dataset, metric] row per needle count for every dataset family, once
# with 'acc_1' and once with 'perf_4'. Dashed strings are visual separators.
# No `type` is set on this summarizer.
needlebench_atc_summarizer = dict(
    dataset_abbrs=[
        *atc_dataset_abbrs,
        '######## Needlebench-ATC Accuracy ########',  # category
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        '######## Needlebench-ATC CircularEval ########',  # category
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
    ],
    summary_groups=needlebench_atc_summary_groups
)
|