mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

* [Feature] Add Ruler datasets * pre-commit fixed * Add model specific tokenizer to dataset * pre-commit modified * remove unused import * fix linting * add trust_remote to tokenizer load * lint fix * comments resolved * fix lint * Add readme * Fix lint * ruler refactorize * fix lint * lint fix * updated * lint fix * fix wonderwords import issue * prompt modified * update * readme updated * update * ruler dataset added * Update --------- Co-authored-by: tonysy <sy.zhangbuaa@gmail.com>
266 lines
10 KiB
Python
266 lines
10 KiB
Python
# flake8: noqa: F401, E501
|
|
import json
|
|
import os
|
|
import random
|
|
import re
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import tiktoken
|
|
from datasets import Dataset
|
|
from transformers import AutoTokenizer
|
|
|
|
from opencompass.datasets.base import BaseDataset
|
|
from opencompass.openicl import BaseEvaluator
|
|
from opencompass.registry import LOAD_DATASET
|
|
from opencompass.utils import get_data_path
|
|
|
|
|
|
@LOAD_DATASET.register_module()
class RulerNiahDataset(BaseDataset):
    """Needle-in-a-haystack (NIAH) dataset from the RULER benchmark.

    Synthetic long-context retrieval: one or more "needle" sentences
    (key/value pairs) are hidden inside filler text (the "haystack"),
    and the prompt asks the model to recall the value(s) for the
    queried key(s).
    """

    @staticmethod
    def load(
        base_path: str,
        file_path: str,
        tokens_to_generate: int = 128,
        max_seq_length: int = 4096,
        tokenizer_model: str = 'gpt-4',
        num_samples: int = 500,
        random_seed: int = 42,
        template:
        str = 'Some special magic {type_needle_v} are hidden within the following text. Make sure to memorize it. I will quiz you about the {type_needle_v} afterwards.\n{context}\nWhat are all the special magic {type_needle_v} for {query} mentioned in the provided text? The special magic {type_needle_v} for {query} mentioned in the provided text are',
        num_needle_k: int = 1,
        num_needle_v: int = 1,
        num_needle_q: int = 1,
        type_haystack: str = 'essay',
        type_needle_k: str = 'words',
        type_needle_v: str = 'numbers',
        remove_newline_tab: str = '',
    ) -> Dataset:
        """Build `num_samples` NIAH prompts sized to fit `max_seq_length`.

        Args:
            base_path: Root directory of the data files.
            file_path: JSONL essay file (one ``{'text': ...}`` record per
                line), used when ``type_haystack == 'essay'``.
            tokens_to_generate: Token budget reserved for the completion.
            max_seq_length: Hard cap on prompt + completion tokens.
            tokenizer_model: ``'gpt-4'`` selects tiktoken; any other value
                is treated as a HuggingFace model id.
            num_samples: Number of examples to generate.
            random_seed: Seed for both ``random`` and ``numpy``.
            template: Prompt template with ``{type_needle_v}``,
                ``{context}`` and ``{query}`` placeholders.
            num_needle_k: Number of distinct keys inserted.
            num_needle_v: Values per key.
            num_needle_q: Keys queried in the question.
            type_haystack: ``'essay'`` | ``'repeat'`` | ``'needle'``.
            type_needle_k: ``'numbers'`` | ``'words'`` | ``'uuids'``.
            type_needle_v: ``'numbers'`` | ``'words'`` | ``'uuids'``.
            remove_newline_tab: If truthy, collapse newlines/tabs in the
                final prompt to single spaces.

        Returns:
            ``datasets.Dataset`` with ``prompt`` and ``answer`` columns
            (``answer`` is a list of gold value strings per prompt).

        Raises:
            NotImplementedError: For unknown haystack/needle types.
            ImportError: If ``wonderwords`` is not installed.
        """
        data = {'prompt': [], 'answer': []}
        # 'gpt-4' uses tiktoken; anything else is loaded as an HF tokenizer
        # so sample lengths match the evaluated model's tokenization.
        if tokenizer_model == 'gpt-4':
            tokenizer = tiktoken.encoding_for_model(tokenizer_model)
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_model,
                                                      trust_remote_code=True)

        random.seed(random_seed)
        np.random.seed(random_seed)
        # Every queried key must exist, so keep at least num_needle_q keys.
        num_needle_k = max(num_needle_k, num_needle_q)

        # Define Needle/Haystack Format
        needle = 'One of the special magic {type_needle_v} for {key} is: {value}.'
        if type_haystack == 'essay':
            essay = os.path.join(base_path, file_path)
            essay = get_data_path(essay, local_mode=True)
            combined_essay = ''
            # JSONL: one {'text': ...} record per line; concatenate all.
            with open(essay, 'r', encoding='utf-8') as f:
                for line in f:
                    line_text = json.loads(line.strip()).get('text',
                                                             '').strip()
                    combined_essay += line_text + ' '
            # Essay haystack is a word list; num_haystack counts words.
            haystack = re.sub(r'\s+', ' ', combined_essay).split(' ')
        elif type_haystack == 'repeat':
            haystack = 'The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.'
        elif type_haystack == 'needle':
            haystack = needle
        else:
            raise NotImplementedError(f'{type_haystack} is not implemented.')

        try:
            import wonderwords
        except ImportError:
            raise ImportError('''Please install wonderwords by:
            pip install wonderwords''')
        # Candidate vocabulary for word-type needles: "adjective-noun".
        nouns = wonderwords.random_word._get_words_from_text_file(
            'nounlist.txt')
        adjs = wonderwords.random_word._get_words_from_text_file(
            'adjectivelist.txt')
        words = [f'{adj}-{noun}' for adj in adjs for noun in nouns]
        # Sort for determinism under the fixed seed.
        words = sorted(list(set(words)))

        # Insertion depths (percent into the document), evenly spaced.
        DEPTHS = list(
            np.round(np.linspace(0, 100, num=40, endpoint=True)).astype(int))

        def _generate_random_number(num_digits=7):
            # Uniform number with exactly `num_digits` digits, as a string.
            lower_bound = 10**(num_digits - 1)
            upper_bound = 10**num_digits - 1
            return str(random.randint(lower_bound, upper_bound))

        def _generate_random_word():
            word = random.choice(words)
            return word

        def _generate_random_uuid():
            # Seeded-random UUID4 (reproducible, unlike uuid.uuid4()).
            return str(uuid.UUID(int=random.getrandbits(128), version=4))

        def _generate_random(type_needle: str):
            # Dispatch on needle type: 'numbers' | 'words' | 'uuids'.
            if type_needle == 'numbers':
                return _generate_random_number()
            elif type_needle == 'words':
                return _generate_random_word()
            elif type_needle == 'uuids':
                return _generate_random_uuid()
            else:
                raise NotImplementedError(
                    f'{type_needle} is not implemented.')

        def _generate_input_output(num_haystack, type_needle_v, template):
            # Build one (prompt, answers) pair using `num_haystack` units
            # of filler (words for 'essay', sentences otherwise).
            keys, values, needles = [], [], []
            for _ in range(num_needle_k):
                keys.append(_generate_random(type_needle_k))
                value = []
                for _ in range(num_needle_v):
                    value.append(_generate_random(type_needle_v))
                    needles.append(
                        needle.format(
                            type_needle_v=type_needle_v,
                            key=keys[-1],
                            value=value[-1],
                        ))
                values.append(value)

            random.Random(random_seed).shuffle(needles)

            # Context
            if type_haystack == 'essay':
                text = ' '.join(haystack[:num_haystack])
                document_sents = text.split('. ')
                document_sents = [
                    sentence.strip() for sentence in document_sents if sentence
                ]  # remove possible whitespace
                # Map sampled depth percentages to sentence indices and
                # insert one needle after each cut point.
                insertion_positions = ([0] + sorted([
                    int(len(document_sents) * (depth / 100))
                    for depth in random.sample(DEPTHS, len(needles))
                ]) + [len(document_sents)])
                document_sents_list = []
                for i in range(1, len(insertion_positions)):
                    last_pos = insertion_positions[i - 1]
                    next_pos = insertion_positions[i]
                    document_sents_list.append(' '.join(
                        document_sents[last_pos:next_pos]))
                    if i - 1 < len(needles):
                        document_sents_list.append(needles[i - 1])
                context = ' '.join(document_sents_list)

            else:
                if type_haystack == 'repeat':
                    sentences = [haystack] * num_haystack
                elif type_haystack == 'needle':
                    # Distractor needles with random keys/values.
                    sentences = [
                        haystack.format(
                            type_needle_v=type_needle_v,
                            key=_generate_random(type_needle_k),
                            value=_generate_random(type_needle_v),
                        ) for _ in range(num_haystack)
                    ]

                # Insert back-to-front so earlier indices stay valid.
                indexes = sorted(random.sample(range(num_haystack),
                                               len(needles)),
                                 reverse=True)
                for index, element in zip(indexes, needles):
                    sentences.insert(index, element)
                context = '\n'.join(sentences)

            ## Query and Answer
            indices = random.sample(range(num_needle_k), num_needle_q)
            queries = [keys[i] for i in indices]
            answers = [a for i in indices for a in values[i]]
            query = (', '.join(queries[:-1]) + ', and ' +
                     queries[-1] if len(queries) > 1 else queries[0])

            # Singularize the prompt wording when only one value is asked.
            if num_needle_q * num_needle_v == 1:
                template = template.replace('Some', 'A')
                template = template.replace('are all', 'is')
                template = template.replace('are', 'is')
                template = template.replace('answers', 'answer')
                type_needle_v = type_needle_v[:-1]  # remove "s"

            input_text = template.format(
                type_needle_v=type_needle_v,
                context=context,
                query=query,
            )

            return input_text, answers

        # Generate Samples: probe upward in `incremental` steps to find
        # the largest haystack size that fits max_seq_length.
        if type_haystack == 'essay':
            incremental = 500
        elif type_haystack == 'repeat':
            incremental = 25
        elif type_haystack == 'needle':
            incremental = 25

        if type_haystack != 'essay' and max_seq_length < 4096:
            incremental = 5

        num_haystack = incremental
        print('Num haystack:', num_haystack)

        total_tokens = 0  # Track the total tokens generated for the first example
        while total_tokens + tokens_to_generate < max_seq_length:
            input_text, answer = _generate_input_output(
                num_haystack, type_needle_v, template)
            # Calculate the number of tokens in the example
            total_tokens = len(tokenizer.encode(input_text + ' '.join(answer)))
            print(
                f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Haystack: {num_haystack}'
            )
            if total_tokens + tokens_to_generate > max_seq_length:
                num_haystack -= incremental
                break

            if type_haystack == 'essay' and num_haystack > len(haystack):
                num_haystack = len(haystack)
                break

            num_haystack += incremental

        # Generate samples
        for index in range(num_samples):
            used_haystack = num_haystack
            while True:
                try:
                    input_text, answer = _generate_input_output(
                        used_haystack, type_needle_v, template)
                    length = len(
                        tokenizer.encode(input_text)) + tokens_to_generate
                    assert length <= max_seq_length, f'{length} exceeds max_seq_length.'
                    break
                except AssertionError:
                    # FIX: was a bare `except:`, which swallowed every
                    # exception (including KeyboardInterrupt and real
                    # tokenizer/generation bugs) and could spin forever.
                    # Only the length assertion is an expected failure.
                    if used_haystack > incremental:
                        used_haystack -= incremental
                    # NOTE(review): at used_haystack <= incremental the same
                    # size is retried, relying on needle randomness to
                    # eventually fit — confirm this cannot stall for tiny
                    # max_seq_length configurations.

            if remove_newline_tab:
                input_text = ' '.join(
                    input_text.replace('\n', ' ').replace('\t',
                                                          ' ').strip().split())

            data['prompt'].append(input_text)
            data['answer'].append(answer)

        dataset = Dataset.from_dict({
            'prompt': data['prompt'],
            'answer': data['answer'],
        })
        return dataset
|
|
|
|
|
|
class RulerNiahEvaluator(BaseEvaluator):
    """Recall-based scorer for RULER needle-in-a-haystack predictions."""

    def score(self, predictions, gold):
        """Average per-sample recall of gold needles, as a percentage.

        A gold value counts as recalled if it appears (case-insensitive
        substring match) in the prediction.

        Args:
            predictions: List of model output strings.
            gold: List of reference lists, one list of gold value
                strings per prediction.

        Returns:
            Dict with key ``'score'``: mean recall in [0, 100], rounded
            to 2 decimals.
        """
        # FIX: guard empty inputs — the original raised ZeroDivisionError
        # on an empty prediction list or an empty per-item reference list.
        if not predictions:
            return {'score': 0.0}
        recalls = []
        for pred, refs in zip(predictions, gold):
            # Hoist pred.lower() out of the inner loop (was recomputed
            # once per reference).
            pred_lower = pred.lower()
            hits = sum(1.0 for r in refs if r.lower() in pred_lower)
            recalls.append(hits / len(refs) if refs else 0.0)
        score = sum(recalls) / len(predictions) * 100
        return {'score': round(score, 2)}
|