OpenCompass/opencompass/datasets/ruler/ruler_niah.py
Linchen Xiao a4b54048ae
[Feature] Add Ruler datasets (#1310)
* [Feature] Add Ruler datasets

* pre-commit fixed

* Add model specific tokenizer to dataset

* pre-commit modified

* remove unused import

* fix linting

* add trust_remote to tokenizer load

* lint fix

* comments resolved

* fix lint

* Add readme

* Fix lint

* ruler refactorize

* fix lint

* lint fix

* updated

* lint fix

* fix wonderwords import issue

* prompt modified

* update

* readme updated

* update

* ruler dataset added

* Update

---------

Co-authored-by: tonysy <sy.zhangbuaa@gmail.com>
2024-08-20 11:40:11 +08:00

266 lines
10 KiB
Python

# flake8: noqa: F401, E501
import json
import os
import random
import re
import uuid
from pathlib import Path
import numpy as np
import tiktoken
from datasets import Dataset
from transformers import AutoTokenizer
from opencompass.datasets.base import BaseDataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
@LOAD_DATASET.register_module()
class RulerNiahDataset(BaseDataset):
    """Synthetic needle-in-a-haystack (NIAH) dataset for the RULER benchmark.

    Each sample is a prompt in which one or more "needles" (sentences of the
    form ``One of the special magic <type> for <key> is: <value>.``) are
    hidden inside a long "haystack" text, followed by a question asking for
    the values that belong to the queried key(s).
    """

    @staticmethod
    def load(
        base_path: str,
        file_path: str,
        tokens_to_generate: int = 128,
        max_seq_length: int = 4096,
        tokenizer_model: str = 'gpt-4',
        num_samples: int = 500,
        random_seed: int = 42,
        template:
        str = 'Some special magic {type_needle_v} are hidden within the following text. Make sure to memorize it. I will quiz you about the {type_needle_v} afterwards.\n{context}\nWhat are all the special magic {type_needle_v} for {query} mentioned in the provided text? The special magic {type_needle_v} for {query} mentioned in the provided text are',
        num_needle_k: int = 1,
        num_needle_v: int = 1,
        num_needle_q: int = 1,
        type_haystack: str = 'essay',
        type_needle_k: str = 'words',
        type_needle_v: str = 'numbers',
        remove_newline_tab: str = '',
    ) -> Dataset:
        """Generate ``num_samples`` NIAH prompts that fit ``max_seq_length``.

        Args:
            base_path: Directory joined with ``file_path`` to locate the
                essay file (only read when ``type_haystack == 'essay'``).
            file_path: JSON-lines file; the ``text`` field of every line is
                concatenated to form the essay haystack.
            tokens_to_generate: Token budget added to the prompt length when
                it is compared against ``max_seq_length``.
            max_seq_length: Upper bound on prompt tokens plus
                ``tokens_to_generate``.
            tokenizer_model: ``'gpt-4'`` selects the tiktoken encoding; any
                other value is passed to ``AutoTokenizer.from_pretrained``.
            num_samples: Number of samples to generate.
            random_seed: Seed for ``random`` / ``numpy.random`` and for the
                needle shuffle.
            template: Prompt template with ``{type_needle_v}``, ``{context}``
                and ``{query}`` placeholders.
            num_needle_k: Number of keys inserted (raised to at least
                ``num_needle_q``).
            num_needle_v: Number of values generated per key.
            num_needle_q: Number of keys that are queried.
            type_haystack: ``'essay'``, ``'repeat'`` or ``'needle'``.
            type_needle_k: Key type: ``'numbers'``, ``'words'`` or
                ``'uuids'``.
            type_needle_v: Value type, same choices as ``type_needle_k``.
            remove_newline_tab: Any truthy value collapses newlines, tabs
                and repeated whitespace in the prompt into single spaces.

        Returns:
            A ``Dataset`` with a ``prompt`` column (str) and an ``answer``
            column (list of str).

        Raises:
            NotImplementedError: Unknown ``type_haystack`` or needle type.
            ImportError: ``wonderwords`` is not installed.
            ValueError: A sample cannot be made to fit ``max_seq_length``
                even with the smallest haystack, or the needles cannot be
                placed in the haystack (raised by ``random.sample``).
        """
        # Once the haystack cannot shrink any further, a sample is
        # regenerated at most this many times (needles are random, so the
        # token count varies slightly) before giving up instead of looping
        # forever.
        max_retries_at_min_haystack = 100

        data = {'prompt': [], 'answer': []}
        if tokenizer_model == 'gpt-4':
            tokenizer = tiktoken.encoding_for_model(tokenizer_model)
        else:
            # NOTE(review): trust_remote_code=True lets the tokenizer repo
            # ship custom Python code - only use with trusted model names.
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_model,
                                                      trust_remote_code=True)

        random.seed(random_seed)
        np.random.seed(random_seed)
        # Every queried key must exist in the context.
        num_needle_k = max(num_needle_k, num_needle_q)

        # Define Needle/Haystack Format
        needle = 'One of the special magic {type_needle_v} for {key} is: {value}.'
        if type_haystack == 'essay':
            essay_path = os.path.join(base_path, file_path)
            essay_path = get_data_path(essay_path, local_mode=True)
            essay_texts = []
            with open(essay_path, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        # Blank lines (e.g. a trailing newline) carry no
                        # record and would make json.loads fail.
                        continue
                    essay_texts.append(
                        json.loads(raw_line).get('text', '').strip())
            # Each text is followed by one space, exactly as when the texts
            # were appended one by one; the haystack is a list of words.
            combined_essay = ''.join(f'{text} ' for text in essay_texts)
            haystack = re.sub(r'\s+', ' ', combined_essay).split(' ')
        elif type_haystack == 'repeat':
            haystack = 'The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.'
        elif type_haystack == 'needle':
            haystack = needle
        else:
            raise NotImplementedError(f'{type_haystack} is not implemented.')

        try:
            import wonderwords
        except ImportError as err:
            raise ImportError(
                'Please install wonderwords by:\npip install wonderwords'
            ) from err

        # Words
        # NOTE(review): relies on a private wonderwords helper (leading
        # underscore); confirm it exists in the installed version.
        nouns = wonderwords.random_word._get_words_from_text_file(
            'nounlist.txt')
        adjs = wonderwords.random_word._get_words_from_text_file(
            'adjectivelist.txt')
        # Sorted so that random.choice is reproducible for a given seed.
        words = sorted({f'{adj}-{noun}' for adj in adjs for noun in nouns})

        # Positions: 40 candidate insertion depths (percent of the document).
        DEPTHS = list(
            np.round(np.linspace(0, 100, num=40, endpoint=True)).astype(int))

        def _generate_random_number(num_digits=7):
            """Return a random ``num_digits``-digit number as a string."""
            lower_bound = 10**(num_digits - 1)
            upper_bound = 10**num_digits - 1
            return str(random.randint(lower_bound, upper_bound))

        def _generate_random_word():
            """Return a random ``adjective-noun`` compound."""
            return random.choice(words)

        def _generate_random_uuid():
            """Return a random version-4 UUID drawn from ``random``."""
            return str(uuid.UUID(int=random.getrandbits(128), version=4))

        def _generate_random(type_needle: str):
            """Return a random key/value of the requested needle type."""
            if type_needle == 'numbers':
                return _generate_random_number()
            if type_needle == 'words':
                return _generate_random_word()
            if type_needle == 'uuids':
                return _generate_random_uuid()
            raise NotImplementedError(f'{type_needle} is not implemented.')

        def _generate_input_output(num_haystack, type_needle_v, template):
            """Build one prompt with ``num_haystack`` haystack units.

            Returns the formatted prompt and the list of expected answers.
            """
            keys, values, needles = [], [], []
            for _ in range(num_needle_k):
                keys.append(_generate_random(type_needle_k))
                value = []
                for _ in range(num_needle_v):
                    value.append(_generate_random(type_needle_v))
                    needles.append(
                        needle.format(
                            type_needle_v=type_needle_v,
                            key=keys[-1],
                            value=value[-1],
                        ))
                values.append(value)

            # A fresh RNG with the fixed seed: the permutation depends only
            # on the number of needles, not on the global random state.
            random.Random(random_seed).shuffle(needles)

            # Context
            if type_haystack == 'essay':
                text = ' '.join(haystack[:num_haystack])
                document_sents = [
                    sentence.strip() for sentence in text.split('. ')
                    if sentence
                ]  # remove possible whitespace
                insertion_positions = ([0] + sorted([
                    int(len(document_sents) * (depth / 100))
                    for depth in random.sample(DEPTHS, len(needles))
                ]) + [len(document_sents)])
                document_sents_list = []
                segments = zip(insertion_positions, insertion_positions[1:])
                for seg_idx, (last_pos, next_pos) in enumerate(segments):
                    document_sents_list.append(' '.join(
                        document_sents[last_pos:next_pos]))
                    # The final segment has no needle after it.
                    if seg_idx < len(needles):
                        document_sents_list.append(needles[seg_idx])
                context = ' '.join(document_sents_list)
            else:
                if type_haystack == 'repeat':
                    sentences = [haystack] * num_haystack
                elif type_haystack == 'needle':
                    # Distractor needles with unrelated keys and values.
                    sentences = [
                        haystack.format(
                            type_needle_v=type_needle_v,
                            key=_generate_random(type_needle_k),
                            value=_generate_random(type_needle_v),
                        ) for _ in range(num_haystack)
                    ]

                # Insert from the highest index down so earlier insertions
                # do not shift the remaining target positions.
                indexes = sorted(random.sample(range(num_haystack),
                                               len(needles)),
                                 reverse=True)
                for index, element in zip(indexes, needles):
                    sentences.insert(index, element)
                context = '\n'.join(sentences)

            # Query and Answer
            indices = random.sample(range(num_needle_k), num_needle_q)
            queries = [keys[i] for i in indices]
            answers = [a for i in indices for a in values[i]]
            if len(queries) > 1:
                query = ', '.join(queries[:-1]) + ', and ' + queries[-1]
            else:
                query = queries[0]

            if num_needle_q * num_needle_v == 1:
                # Single answer: switch the template to singular wording.
                # NOTE(review): these are plain substring replacements, so
                # 'are' is also rewritten inside longer words of a custom
                # template.
                template = template.replace('Some', 'A')
                template = template.replace('are all', 'is')
                template = template.replace('are', 'is')
                template = template.replace('answers', 'answer')
                type_needle_v = type_needle_v[:-1]  # remove "s"

            input_text = template.format(
                type_needle_v=type_needle_v,
                context=context,
                query=query,
            )

            return input_text, answers

        # Generate Samples
        # Step size used to grow/shrink the haystack; type_haystack has
        # already been validated above.
        incremental = 500 if type_haystack == 'essay' else 25
        if type_haystack != 'essay' and max_seq_length < 4096:
            incremental = 5

        num_haystack = incremental
        print('Num haystack:', num_haystack)

        # Grow the haystack until one example no longer fits the budget.
        total_tokens = 0  # Track the total tokens generated for the first example
        while total_tokens + tokens_to_generate < max_seq_length:
            input_text, answer = _generate_input_output(
                num_haystack, type_needle_v, template)
            # Calculate the number of tokens in the example
            total_tokens = len(tokenizer.encode(input_text + ' '.join(answer)))
            print(
                f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Haystack: {num_haystack}'
            )
            if total_tokens + tokens_to_generate > max_seq_length:
                num_haystack -= incremental
                break

            if type_haystack == 'essay' and num_haystack > len(haystack):
                num_haystack = len(haystack)
                break

            num_haystack += incremental

        # Generate samples
        for _ in range(num_samples):
            used_haystack = num_haystack
            retries_at_min_haystack = 0
            while True:
                input_text, answer = _generate_input_output(
                    used_haystack, type_needle_v, template)
                length = len(
                    tokenizer.encode(input_text)) + tokens_to_generate
                if length <= max_seq_length:
                    break
                if used_haystack > incremental:
                    # Too long: retry with a smaller haystack.
                    used_haystack -= incremental
                    continue
                # The haystack cannot shrink further; regenerate a bounded
                # number of times, then fail loudly rather than hang.
                retries_at_min_haystack += 1
                if retries_at_min_haystack >= max_retries_at_min_haystack:
                    raise ValueError(
                        f'Cannot fit a sample into max_seq_length='
                        f'{max_seq_length}: {length} tokens are needed '
                        f'(including tokens_to_generate='
                        f'{tokens_to_generate}) with a haystack of size '
                        f'{used_haystack}.')

            if remove_newline_tab:
                input_text = ' '.join(
                    input_text.replace('\n', ' ').replace('\t',
                                                          ' ').strip().split())

            data['prompt'].append(input_text)
            data['answer'].append(answer)

        dataset = Dataset.from_dict({
            'prompt': data['prompt'],
            'answer': data['answer'],
        })
        return dataset
class RulerNiahEvaluator(BaseEvaluator):
    """Recall-style scorer for needle-in-a-haystack predictions."""

    def score(self, predictions, gold):
        """Return the mean fraction of reference answers found, in percent.

        Args:
            predictions: Iterable of predicted strings.
            gold: Iterable of reference lists; ``gold[i]`` holds the answer
                strings expected in ``predictions[i]``.

        Returns:
            ``{'score': value}`` where ``value`` is the average, over all
            predictions, of the share of references that occur in the
            prediction as a case-insensitive substring, multiplied by 100
            and rounded to two decimals. An empty ``predictions`` yields
            ``0.0``; a sample with an empty reference list contributes 0.
        """
        if not predictions:
            # Nothing to score; avoid dividing by zero below.
            return {'score': 0.0}

        per_sample = []
        for pred, ref in zip(predictions, gold):
            if not ref:
                # No expected answers: count as a miss instead of crashing.
                per_sample.append(0.0)
                continue
            pred_lower = pred.lower()
            hits = sum(1.0 for r in ref if r.lower() in pred_lower)
            per_sample.append(hits / len(ref))

        score = sum(per_sample) / len(predictions) * 100
        return {'score': round(score, 2)}