From 69aa2f2d5705519ed735e2269ff7a71fabb43cd6 Mon Sep 17 00:00:00 2001 From: Mo Li <82895469+DseidLi@users.noreply.github.com> Date: Thu, 25 Jul 2024 19:01:56 +0800 Subject: [PATCH] [Feature] Make NeedleBench available on HF (#1364) * update_lint * update_huggingface format * fix bug * update docs --- .../advanced_guides/needleinahaystack_eval.md | 2 ++ .../advanced_guides/needleinahaystack_eval.md | 2 ++ opencompass/datasets/needlebench/multi.py | 29 ++++++++++++----- opencompass/datasets/needlebench/origin.py | 31 ++++++++++++++----- opencompass/datasets/needlebench/parallel.py | 30 ++++++++++++------ 5 files changed, 70 insertions(+), 24 deletions(-) diff --git a/docs/en/advanced_guides/needleinahaystack_eval.md b/docs/en/advanced_guides/needleinahaystack_eval.md index 7ad4997a..66f012af 100644 --- a/docs/en/advanced_guides/needleinahaystack_eval.md +++ b/docs/en/advanced_guides/needleinahaystack_eval.md @@ -18,6 +18,8 @@ Within the `NeedleBench` framework of `OpenCompass`, we have designed a series o ### Evaluation Steps +> Note: In the latest code, OpenCompass has been set to automatically load the dataset from [Huggingface API](https://huggingface.co/datasets/opencompass/NeedleBench), so you can **skip directly** the following steps of manually downloading and placing the dataset. + 1. Download the dataset from [here](https://github.com/open-compass/opencompass/files/14741330/needlebench.zip). 2. Place the downloaded files in the `opencompass/data/needlebench/` directory. 
The expected file structure in the `needlebench` directory is shown below: diff --git a/docs/zh_cn/advanced_guides/needleinahaystack_eval.md b/docs/zh_cn/advanced_guides/needleinahaystack_eval.md index 05457958..74abaa57 100644 --- a/docs/zh_cn/advanced_guides/needleinahaystack_eval.md +++ b/docs/zh_cn/advanced_guides/needleinahaystack_eval.md @@ -18,6 +18,8 @@ ### 评估步骤 +> 注意:在最新代码中,OpenCompass已经设置数据集从[Huggingface的接口](https://huggingface.co/datasets/opencompass/NeedleBench)中自动加载,可以直接跳过下面的手动下载安放数据集。 + 1. 从[这里](https://github.com/open-compass/opencompass/files/14741330/needlebench.zip)下载数据集。 2. 将下载的文件放置于`opencompass/data/needlebench/`目录下。`needlebench`目录中预期的文件结构如下所示: diff --git a/opencompass/datasets/needlebench/multi.py b/opencompass/datasets/needlebench/multi.py index 64a37432..cbb96b1c 100644 --- a/opencompass/datasets/needlebench/multi.py +++ b/opencompass/datasets/needlebench/multi.py @@ -1,10 +1,10 @@ import json import os import random -from pathlib import Path import tiktoken from datasets import Dataset +from huggingface_hub import hf_hub_download from opencompass.datasets.base import BaseDataset from opencompass.openicl import BaseEvaluator @@ -37,7 +37,7 @@ class NeedleBenchMultiDataset(BaseDataset): @staticmethod def load( - path: str, + path: str, # deprecated length: int, depth: int, tokenizer_model: str, @@ -152,13 +152,28 @@ class NeedleBenchMultiDataset(BaseDataset): return prompt - files = Path(path).glob('*.jsonl') - needle_file_path = os.path.join(path, needle_file_name) - for file in files: - if file.name not in file_list: + repo_id = 'opencompass/NeedleBench' + file_names = [ + 'PaulGrahamEssays.jsonl', 'multi_needle_reasoning_en.json', + 'multi_needle_reasoning_zh.json', 'zh_finance.jsonl', + 'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl', + 'zh_movie.jsonl', 'zh_tech.jsonl' + ] + downloaded_files = [] + base_file_path = '' + for file_name in file_names: + file_path = hf_hub_download(repo_id=repo_id, + filename=file_name, + 
repo_type='dataset') + downloaded_files.append(file_path) + base_file_path = '/'.join(file_path.split('/')[:-1]) + + needle_file_path = os.path.join(base_file_path, needle_file_name) + for file_path in downloaded_files: + if file_path.split('/')[-1] not in file_list: continue - with open(file, 'r', encoding='utf-8') as f: + with open(file_path, 'r', encoding='utf-8') as f: lines_bak = [json.loads(line.strip()) for line in f] lines = lines_bak.copy() for counter in range(num_repeats_per_file): diff --git a/opencompass/datasets/needlebench/origin.py b/opencompass/datasets/needlebench/origin.py index 42bcc64b..da3e6cc8 100644 --- a/opencompass/datasets/needlebench/origin.py +++ b/opencompass/datasets/needlebench/origin.py @@ -2,10 +2,10 @@ import json import os import random import re -from pathlib import Path import tiktoken from datasets import Dataset +from huggingface_hub import hf_hub_download from opencompass.datasets.base import BaseDataset from opencompass.openicl import BaseEvaluator @@ -36,7 +36,7 @@ class NeedleBenchOriginDataset(BaseDataset): @staticmethod def load( - path: str, + path: str, # deprecated length: int, depth: int, tokenizer_model: str, @@ -128,18 +128,33 @@ class NeedleBenchOriginDataset(BaseDataset): return prompt - files = Path(path).glob('*.jsonl') - for file in files: - if file.name not in file_list: - continue + repo_id = 'opencompass/NeedleBench' + file_names = [ + 'PaulGrahamEssays.jsonl', 'needles.jsonl', 'zh_finance.jsonl', + 'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl', + 'zh_movie.jsonl', 'zh_tech.jsonl' + ] - with open(file, 'r', encoding='utf-8') as f: + downloaded_files = [] + base_file_path = '' + for file_name in file_names: + file_path = hf_hub_download(repo_id=repo_id, + filename=file_name, + repo_type='dataset') + downloaded_files.append(file_path) + base_file_path = '/'.join(file_path.split('/')[:-1]) + + for file_path in downloaded_files: + if file_path.split('/')[-1] not in file_list: + continue + with 
open(file_path, 'r', encoding='utf-8') as f: lines_bak = [json.loads(line.strip()) for line in f] lines = lines_bak.copy() for counter in range(num_repeats_per_file): random.seed(counter) random.shuffle(lines) - needle_file_path = os.path.join(path, needle_file_name) + needle_file_path = os.path.join(base_file_path, + needle_file_name) random_needle = get_random_line_by_language( counter, needle_file_path, language) needle = '\n' + random_needle['needle'] + '\n' diff --git a/opencompass/datasets/needlebench/parallel.py b/opencompass/datasets/needlebench/parallel.py index 3514e5ab..59c971f1 100644 --- a/opencompass/datasets/needlebench/parallel.py +++ b/opencompass/datasets/needlebench/parallel.py @@ -1,9 +1,9 @@ import json import random -from pathlib import Path import tiktoken from datasets import Dataset +from huggingface_hub import hf_hub_download from opencompass.datasets.base import BaseDataset from opencompass.openicl import BaseEvaluator @@ -57,7 +57,7 @@ class NeedleBenchParallelDataset(BaseDataset): @staticmethod def load( - path: str, + path: str, # deprecated needle_file_name: str, length: int, depths: list[int], @@ -72,9 +72,22 @@ class NeedleBenchParallelDataset(BaseDataset): data = {'prompt': [], 'answer': []} tokenizer = tiktoken.encoding_for_model(tokenizer_model) - files = Path(path).glob('*.jsonl') - for file in files: - if file.name == needle_file_name: + repo_id = 'opencompass/NeedleBench' + file_names = [ + 'PaulGrahamEssays.jsonl', 'needles.jsonl', 'zh_finance.jsonl', + 'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl', + 'zh_movie.jsonl', 'zh_tech.jsonl' + ] + + downloaded_files = [] + for file_name in file_names: + file_path = hf_hub_download(repo_id=repo_id, + filename=file_name, + repo_type='dataset') + downloaded_files.append(file_path) + + for file in downloaded_files: + if file.split('/')[-1] == needle_file_name: needle_file_path = file predefined_needles_bak = get_unique_entries(needle_file_path, @@ -178,12 +191,11 @@ class 
NeedleBenchParallelDataset(BaseDataset): return prompt - files = Path(path).glob('*.jsonl') - for file in files: - if file.name not in file_list: + for file_path in downloaded_files: + if file_path.split('/')[-1] not in file_list: continue - with open(file, 'r', encoding='utf-8') as f: + with open(file_path, 'r', encoding='utf-8') as f: lines_bak = [json.loads(line.strip()) for line in f] lines = lines_bak.copy() for counter in range(num_repeats_per_file):