[Feature] Add long context evaluation for base models (#1666)
* [Update] Add base long context evaluation
* update
parent fd7aa83c01
commit 835bf75a36
@@ -5,11 +5,18 @@ models = [
         type=TurboMindModelwithChatTemplate,
         abbr='mixtral-large-instruct-2407-turbomind',
         path='mistralai/Mistral-Large-Instruct-2407',
-        engine_config=dict(session_len=32768, max_batch_size=16, tp=4),
-        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        engine_config=dict(
+            session_len=32768,
+            max_batch_size=16,
+            tp=4,
+            cache_max_entry_count=0.7,
+        ),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
+        ),
         max_seq_len=32768,
         max_out_len=4096,
-        batch_size=16,
+        batch_size=8,
         run_cfg=dict(num_gpus=4),
     )
 ]
@@ -138,6 +138,10 @@ needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_lis
 context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
 needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, '1000k')
 
+depths_list_internal = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, ]
+needlebench_internal_32k_summarizer = create_summarizer([32000], depths_list_internal, '32000')
+needlebench_internal_100k_summarizer = create_summarizer([100000], depths_list_internal, '100000')
+needlebench_internal_200k_summarizer = create_summarizer([200000], depths_list_internal, '200000')
 
 _needlebench_8k_parallel_en_batch1 = []
 _needlebench_8k_parallel_en_batch5 = []
@@ -0,0 +1,8 @@
+from mmengine.config import read_base
+
+with read_base():
+
+    from .needlebench_single import needlebench_en_datasets as needlebench_origin_en_datasets
+    from .needlebench_single import needlebench_zh_datasets as needlebench_origin_zh_datasets
+
+needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
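
For context, a minimal sketch of a top-level eval config (not part of this commit) that wires the aggregated `needlebench_datasets` above to one of the model lists added below. The import paths are illustrative placeholders, not the actual file names in the repository:

from mmengine.config import read_base

with read_base():
    # hypothetical paths, adjust to wherever the new configs actually live
    from .datasets.needlebench.needlebench_base import needlebench_datasets
    from .models.mistral.lmdeploy_ministral_8b_instruct_2410 import models

datasets = needlebench_datasets

Such a config is launched the usual way, e.g. `python run.py <eval_config>.py`.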
@@ -0,0 +1,111 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
+from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
+from opencompass.datasets.needlebench.origin import needlebench_postprocess
+from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
+import math
+
+
+def logistic(x, L=100, x0=50, k=0.1):
+    return round(L / (1 + math.exp(-k * (x - x0))), 3)
+
+
+def generate_linear_space(start, end, num):
+    if num == 1:
+        return [start]
+    elif num < 1:
+        raise ValueError('num must be at least 1.')
+    step = (end - start) / (num - 1)
+    return [start + step * i for i in range(num)]
+
+
+def generate_depth_percents(intervals, interval_type):
+    if interval_type == 'linear':
+        return generate_linear_space(0, 100, intervals)
+    elif interval_type == 'sigmoid':
+        linear_space = generate_linear_space(0, 100, intervals)
+        return [logistic(x) for x in linear_space]
+    else:
+        raise ValueError('Unsupported interval type')
+
+
+needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
+
+needlebench_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{prompt}'),
+                # dict(role='BOT', prompt='{answer}\n'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+needlebench_eval_cfg = dict(
+    evaluator=dict(type=NeedleBenchOriginEvaluator),
+    pred_postprocessor=dict(type=needlebench_postprocess),
+    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
+    pred_role='BOT',
+)
+
+context_lengths = [32000, 100000, 200000, ] # 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000
+depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, ] #
+
+base_path = 'opencompass/needlebench'
+file_list = ['en_un_asr.jsonl']
+needlebench_en_datasets = []
+needle_file_name = 'needles.jsonl'
+
+for original_context_length in context_lengths:
+    for depth_percent in depths_list:
+        dataset_dict = {
+            'abbr': f'Length{original_context_length}'
+            f'Depth{int(depth_percent)}_origin_en_{original_context_length}',
+            'type': NeedleBenchOriginDataset,
+            'path': base_path,
+            'length': original_context_length,
+            'depth': int(depth_percent),
+            'tokenizer_model': 'gpt-4',
+            'file_list': file_list,
+            'num_repeats_per_file': 10,
+            'length_buffer': 600,
+            'guide': False,
+            'language': 'English',
+            'needle_file_name': needle_file_name,
+            'reader_cfg': needlebench_reader_cfg,
+            'infer_cfg': needlebench_infer_cfg,
+            'eval_cfg': needlebench_eval_cfg,
+        }
+        needlebench_en_datasets.append(dataset_dict)
+
+file_list = ['zh_all.jsonl']
+needlebench_zh_datasets = []
+needle_file_name = 'needles.jsonl'
+
+for original_context_length in context_lengths:
+    for depth_percent in depths_list:
+        dataset_dict = {
+            'abbr': f'Length{original_context_length}'
+            f'Depth{int(depth_percent)}_origin_zh_{original_context_length}',
+            'type': NeedleBenchOriginDataset,
+            'path': base_path,
+            'length': original_context_length,
+            'depth': int(depth_percent),
+            'tokenizer_model': 'gpt-4',
+            'file_list': file_list,
+            'num_repeats_per_file': 10,
+            'length_buffer': 200,
+            'guide': False,
+            'language': 'Chinese',
+            'needle_file_name': needle_file_name,
+            'reader_cfg': needlebench_reader_cfg,
+            'infer_cfg': needlebench_infer_cfg,
+            'eval_cfg': needlebench_eval_cfg,
+        }
+        needlebench_zh_datasets.append(dataset_dict)
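
A small usage sketch (not part of the commit) of the depth helpers defined in this file. The dataset sweep itself hard-codes depths_list, so with the current lists the file yields 3 lengths x 11 depths = 33 English and 33 Chinese dataset entries, but the helpers show the two supported spacing modes; mapping an even grid through the logistic curve concentrates sampled depths near the very start and end of the context:

# illustrative only; the sigmoid values are rounded to three decimals by logistic()
print(generate_depth_percents(5, 'linear'))
# [0.0, 25.0, 50.0, 75.0, 100.0]
print(generate_depth_percents(5, 'sigmoid'))
# [0.669, 7.586, 50.0, 92.414, 99.331]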
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr="ministral-8B-instruct-2410-hf",
+        path="mistralai/Ministral-8B-Instruct-2410",
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=2),
+    )
+]
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr="ministral-8B-instruct-2410-turbomind",
+        path="mistralai/Ministral-8B-Instruct-2410",
+        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        max_seq_len=16384,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
@@ -5,11 +5,18 @@ models = [
         type=TurboMindModelwithChatTemplate,
         abbr='mixtral-large-instruct-2407-turbomind',
         path='mistralai/Mistral-Large-Instruct-2407',
-        engine_config=dict(session_len=32768, max_batch_size=16, tp=4),
-        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        engine_config=dict(
+            session_len=32768,
+            max_batch_size=16,
+            tp=4,
+            cache_max_entry_count=0.7,
+        ),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
+        ),
         max_seq_len=32768,
         max_out_len=4096,
-        batch_size=16,
+        batch_size=8,
         run_cfg=dict(num_gpus=4),
     )
 ]
@@ -0,0 +1,18 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='nvidia-3_1-Nemotron-70b-instruct-HF-turbomind',
+        path='nvidia/Llama-3.1-Nemotron-70B-Instruct-HF',
+        engine_config=dict(max_batch_size=16, tp=4),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
+        ),
+        max_seq_len=16384,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=4),
+        stop_words=['<|end_of_text|>', '<|eot_id|>'],
+    )
+]
@@ -138,6 +138,10 @@ needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_lis
 context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
 needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, '1000k')
 
+depths_list_internal = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, ]
+needlebench_internal_32k_summarizer = create_summarizer([32000], depths_list_internal, '32000')
+needlebench_internal_100k_summarizer = create_summarizer([100000], depths_list_internal, '100000')
+needlebench_internal_200k_summarizer = create_summarizer([200000], depths_list_internal, '200000')
 
 _needlebench_8k_parallel_en_batch1 = []
 _needlebench_8k_parallel_en_batch5 = []
@@ -81,10 +81,30 @@ class NeedleBenchOriginDataset(BaseDataset):
             else:
                 raise ValueError(f"Language '{language}' is not supported.")
 
+        def _modify_retrieval_question_for_base(retrieval_question):
+            if language == 'Chinese':
+                parts = retrieval_question.split('请按照')
+                retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题'
+                                      '最相关的内容是什么。请按照' + parts[1])
+                return retrieval_question.replace("请按照'", '')[:-16]
+            elif language == 'English':
+                parts = retrieval_question.split('Please answer in the format')
+                retrieval_question = (
+                    parts[0] + 'Before answering, please consider'
+                    ' what in the document is most relevant to this question.'
+                    ' Please answer in the format' + parts[1])
+                return retrieval_question.replace(
+                    "Please answer in the format '", '')[:-10]
+            else:
+                raise ValueError(f"Language '{language}' is not supported.")
+
         def _generate_prompt(context, retrieval_question):
             if guide:
                 retrieval_question = _modify_retrieval_question(
                     retrieval_question)
+            else:
+                retrieval_question = _modify_retrieval_question_for_base(
+                    retrieval_question)
 
             if language == 'Chinese':
                 if position == 'End':
@@ -129,10 +149,10 @@ class NeedleBenchOriginDataset(BaseDataset):
             return prompt
 
         file_names = [
-            'PaulGrahamEssays.jsonl', 'multi_needle_reasoning_en.json',
-            'multi_needle_reasoning_zh.json', 'zh_finance.jsonl',
-            'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl',
-            'zh_movie.jsonl', 'zh_tech.jsonl'
+            'en_un_asr.jsonl', 'zh_all.jsonl', 'PaulGrahamEssays.jsonl',
+            'multi_needle_reasoning_en.json', 'multi_needle_reasoning_zh.json',
+            'zh_finance.jsonl', 'zh_game.jsonl', 'zh_general.jsonl',
+            'zh_government.jsonl', 'zh_movie.jsonl', 'zh_tech.jsonl'
         ]
         path = get_data_path(path)
         if os.environ.get('DATASET_SOURCE') == 'HF':
@@ -518,6 +518,7 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
                  max_seq_len: Optional[int] = None,
                  pad_token_id: Optional[int] = None,
                  stop_words: Optional[str] = [],
+                 drop_middle: bool = False,
                  **other_kwargs):
 
         self.logger = get_logger()
@@ -525,6 +526,7 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
         self.tokenizer_only = tokenizer_only
         self.template_parser = LMTemplateParser()
         self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
+        self.drop_middle = drop_middle
         self._load_tokenizer(tokenizer_path or path, tokenizer_kwargs, pad_token_id)
         if not tokenizer_only:
             self._load_model(path=path, kwargs=model_kwargs, peft_path=peft_path, peft_kwargs=peft_kwargs)
@@ -551,7 +553,17 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
             add_special_tokens=True,
             max_length=self.max_seq_len
         )
-        tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
+        if self.drop_middle:
+            assert len(inputs) == 1
+            input_ids = self.tokenizer(inputs, padding=False, truncation=False)['input_ids']
+            input_ids = torch.tensor(input_ids)
+            if input_ids.shape[-1] > self.max_seq_len:
+                input_ids = torch.cat([input_ids[:, : self.max_seq_len // 2], input_ids[:, - self.max_seq_len // 2:]], dim=-1)
+            tokens = {'input_ids': input_ids, }
+        else:
+            tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
         tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
 
         generation_kwargs = self.generation_kwargs.copy()
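
The drop_middle branch above implements head-plus-tail truncation: when a tokenized prompt exceeds max_seq_len, it keeps the first and last max_seq_len // 2 tokens and discards the middle, so material near either end of a long document survives. A minimal standalone sketch of the same tensor operation (not part of the diff):

import torch

max_seq_len = 8
input_ids = torch.arange(12).unsqueeze(0)      # toy prompt of 12 token ids, shape (1, 12)
if input_ids.shape[-1] > max_seq_len:
    head = input_ids[:, : max_seq_len // 2]    # first 4 tokens
    tail = input_ids[:, - max_seq_len // 2:]   # last 4 tokens
    input_ids = torch.cat([head, tail], dim=-1)
print(input_ids)                               # tensor([[0, 1, 2, 3, 8, 9, 10, 11]])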
@@ -603,7 +615,17 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
             add_special_tokens=True,
             max_length=self.max_seq_len
         )
-        tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
+        if self.drop_middle:
+            assert len(inputs) == 1
+            input_ids = self.tokenizer(inputs, padding=False, truncation=False)['input_ids']
+            input_ids = torch.tensor(input_ids)
+            if input_ids.shape[-1] > self.max_seq_len:
+                input_ids = torch.cat([input_ids[:, : self.max_seq_len // 2], input_ids[:, - self.max_seq_len // 2:]], dim=-1)
+            tokens = {'input_ids': input_ids, }
+        else:
+            tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
         tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
         outputs = self.model(**tokens)[0]
 
@@ -53,11 +53,13 @@ class TurboMindModel(BaseModel):
                  engine_config: Dict = {},
                  gen_config: Dict = {},
                  batch_padding: bool = False,
+                 drop_middle: bool = False,
                  end_str: Optional[str] = None):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
         self.logger = get_logger()
+        self.drop_middle = drop_middle
         self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
         from lmdeploy import version_info
         from transformers import AutoTokenizer
@@ -118,6 +120,21 @@ class TurboMindModel(BaseModel):
         }
         gen_config = GenerationConfig(**gen_config)
 
+        if self.drop_middle:
+            inputs_drop_middle = []
+            for input in inputs:
+                input_ids = self.tokenizer([input],
+                                           padding=False,
+                                           truncation=False)['input_ids'][0]
+                if len(input_ids) > self.max_seq_len:
+                    input_ids = input_ids[:self.max_seq_len //
+                                          2] + input_ids[-self.max_seq_len //
+                                                         2:]
+                    input = self.tokenizer.decode(input_ids,
+                                                  skip_special_tokens=True)
+                inputs_drop_middle.append(input)
+            inputs = inputs_drop_middle
+
         results = []
         outputs = self.pipe(inputs, gen_config=gen_config, do_preprocess=False)
         for output in outputs:
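
A model config would opt into the new behaviour through the drop_middle flag added above. A hedged sketch of such an entry (names and values are illustrative, not part of this commit):

from opencompass.models import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='some-base-model-turbomind',   # hypothetical entry
        path='org/SomeBaseModel',           # hypothetical HF path
        engine_config=dict(session_len=32768, max_batch_size=1, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=100),
        max_seq_len=32000,
        max_out_len=100,
        drop_middle=True,   # over-long prompts keep the first and last max_seq_len // 2 tokens
        batch_size=1,
        run_cfg=dict(num_gpus=1),
    )
]

The HuggingFaceBaseModel path accepts the same flag; its implementation additionally asserts that it receives a single input per call.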