[Update] Customizable tokenizer for RULER (#1731)

* Customizable tokenizer for RULER

* Relax requirements
This commit is contained in:
Chang Lan 2024-12-19 02:02:11 -08:00 committed by GitHub
parent 499302857f
commit d70100cdf2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 54 additions and 16 deletions

View File

@@ -1,3 +1,5 @@
+import os
+
 from mmengine.config import read_base

 with read_base():
@@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])

 # Evaluation config
 NUM_SAMPLES = 100  # Change to the number of samples you need
+tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
 # Change the context lengths to be tested
 max_seq_lens = [1024 * 128]
 abbr_suffixs = ['128k']
@@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
         tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
         tmp_dataset['num_samples'] = NUM_SAMPLES
         tmp_dataset['max_seq_length'] = max_seq_len
+        tmp_dataset['tokenizer_model'] = tokenizer_model
         ruler_datasets.append(tmp_dataset)

View File

@@ -1,3 +1,4 @@
+import os
 from mmengine.config import read_base
@@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])

 # Evaluation config
 NUM_SAMPLES = 100  # Change to the number of samples you need
+tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
 # Change the context lengths to be tested
 max_seq_lens = [1024 * 16]
 abbr_suffixs = ['16k']
@@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
         tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
         tmp_dataset['num_samples'] = NUM_SAMPLES
         tmp_dataset['max_seq_length'] = max_seq_len
+        tmp_dataset['tokenizer_model'] = tokenizer_model
         ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 1024] max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m'] abbr_suffixs = ['1m']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 32] max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k'] abbr_suffixs = ['32k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 4] max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k'] abbr_suffixs = ['4k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 64] max_seq_lens = [1024 * 64]
abbr_suffixs: list[str] = ['64k'] abbr_suffixs: list[str] = ['64k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 8] max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k'] abbr_suffixs = ['8k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@@ -1,9 +1,7 @@
+from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
-from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator

 # Ruler Dataset settings
 niah_configurations = [
@@ -92,10 +90,7 @@ for index, config in enumerate(niah_configurations):
             'type': RulerNiahDataset,
             'base_path': base_path,
             'file_path': file_path,
-            # 'tokenizer_model': model_path,
             'tokens_to_generate': 128,
-            # 'max_seq_length': max_seq_len,
-            # 'num_samples': NUM_SAMPLES,
             'type_haystack': config['type_haystack'],
             'type_needle_k': config['type_needle_k'],
             'type_needle_v': config['type_needle_v'],

View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 128] max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k'] abbr_suffixs = ['128k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 16] max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k'] abbr_suffixs = ['16k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 1024] max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m'] abbr_suffixs = ['1m']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 32] max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k'] abbr_suffixs = ['32k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 4] max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k'] abbr_suffixs = ['4k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,5 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
@ -12,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 64] max_seq_lens = [1024 * 64]
abbr_suffixs: list[str] = ['64k'] abbr_suffixs: list[str] = ['64k']
@ -25,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@ -1,3 +1,4 @@
import os
from mmengine.config import read_base from mmengine.config import read_base
@ -13,6 +14,7 @@ import_ds = sum((cwe, fwe, niah, qa, vt), [])
# Evaluation config # Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested # Change the context lengths to be tested
max_seq_lens = [1024 * 8] max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k'] abbr_suffixs = ['8k']
@ -26,4 +28,5 @@ for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset) ruler_datasets.append(tmp_dataset)

View File

@@ -1,9 +1,7 @@
+from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
-from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator

 # Ruler Dataset settings
 niah_configurations = [
@@ -92,10 +90,7 @@ for index, config in enumerate(niah_configurations):
             'type': RulerNiahDataset,
             'base_path': base_path,
             'file_path': file_path,
-            # 'tokenizer_model': model_path,
             'tokens_to_generate': 128,
-            # 'max_seq_length': max_seq_len,
-            # 'num_samples': NUM_SAMPLES,
             'type_haystack': config['type_haystack'],
             'type_needle_k': config['type_needle_k'],
             'type_needle_v': config['type_needle_v'],

View File

@@ -2,7 +2,7 @@ absl-py
 accelerate>=0.19.0
 cpm_kernels
 datasets>=2.12.0
-einops==0.5.0
+einops>=0.5.0
 evaluate>=0.3.0
 func_timeout
 fuzzywuzzy
@@ -16,7 +16,7 @@ jieba
 json5
 jsonlines
 mmengine-lite
-nltk==3.8
+nltk>=3.7
 numpy>=1.23.4,<2.0.0
 openai
 OpenCC