2024-08-20 11:40:11 +08:00
|
|
|
from opencompass.partitioners import (
|
|
|
|
NaivePartitioner,
|
|
|
|
NumWorkerPartitioner,
|
|
|
|
)
|
|
|
|
from mmengine.config import read_base
|
|
|
|
from opencompass.runners import LocalRunner
|
|
|
|
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
|
|
|
|
|
|
|
# Base-config imports. Everything below is the body of the `with read_base():`
# block and therefore has to be indented under it.
with read_base():
    # Models under evaluation (each import is a list of model configs).
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
        models as qwen2_7b_instruct_model,
    )
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
        models as llama3_8b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import (
        models as internlm2_5_7b_chat_1m,
    )

    # RULER dataset groups. Every name ending in `_datasets` is picked up
    # automatically by `import_datasets` below.
    from opencompass.configs.datasets.ruler.ruler_niah_gen import niah_datasets  # Niah
    from opencompass.configs.datasets.ruler.ruler_vt_gen import vt_datasets  # VT
    from opencompass.configs.datasets.ruler.ruler_fwe_gen import fwe_datasets  # FWE
    from opencompass.configs.datasets.ruler.ruler_cwe_gen import cwe_datasets  # CWE
    from opencompass.configs.datasets.ruler.ruler_qa_gen import qa_datasets  # QA

    # Summary groups; picked up by the `_summary_groups` suffix filter in
    # `summarizer`.
    from opencompass.configs.summarizers.groups.ruler import ruler_summary_groups

# Concatenate every list in scope whose name ends in `_datasets` into one flat
# list. The filter is purely by name suffix, so this must run before any other
# `*_datasets` variable (e.g. `_tmp_datasets` further down) is defined.
import_datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
|
|
|
|
|
|
|
|
# Evaluation config
# Written into every dataset config as `num_samples` by the combination loop.
NUM_SAMPLES = 500

# Change the context lengths to be tested.
# `max_seq_lens` and `abbr_suffixs` are paired by position: entry i of
# `abbr_suffixs` is the suffix appended to the dataset abbr for entry i of
# `max_seq_lens`. Keep both lists the same length and in the same order.
max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32]
abbr_suffixs = ['4k', '8k', '16k', '32k']

work_dir = './outputs/ruler'
|
|
|
|
|
|
|
|
# Model Settings
# 33792 == 33 * 1024, i.e. 1024 tokens above the largest context length tested
# (32k). `max_seq_len` and the engine's `session_len` are set to the same value.
# NOTE(review): `tp` is presumably the engine's tensor-parallel degree; it is
# set to the same value as `num_gpus`, so change the two together.
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['tp'] = 2
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2

llama3_8b_instruct_model[0]['max_seq_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['session_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['tp'] = 2
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2

# Each entry is [model config, tokenizer path]. The path is copied into every
# dataset's `tokenizer_model` field by the combination loop.
# internlm2_5_7b_chat_1m is used with the settings from its base config.
model_settings = [
    [qwen2_7b_instruct_model[0], 'Qwen/Qwen2-7B-Instruct'],
    [llama3_8b_instruct_model[0], 'meta-llama/Meta-Llama-3-8B-Instruct'],
    [internlm2_5_7b_chat_1m[0], 'internlm/internlm2_5-7b-chat-1m'],
]
|
|
|
|
|
|
|
|
|
|
|
|
# Dataset Model Combination
datasets = []
models = []
model_dataset_combinations = []

# The two lists are consumed pairwise below; zip() would silently drop the
# trailing entries of the longer one, so fail loudly on a mismatch instead.
if len(max_seq_lens) != len(abbr_suffixs):
    raise ValueError(
        'max_seq_lens and abbr_suffixs must have the same length, got '
        f'{len(max_seq_lens)} and {len(abbr_suffixs)}'
    )

# Different seq length
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for model, model_path in model_settings:
        # Per (context length, model) copy of every imported dataset, so the
        # tokenizer and length settings never leak between combinations.
        _tmp_datasets = []
        for dataset in import_datasets:
            tmp_dataset = dataset.deepcopy()
            tmp_dataset['tokenizer_model'] = model_path
            tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
            tmp_dataset['num_samples'] = NUM_SAMPLES
            tmp_dataset['max_seq_length'] = max_seq_len
            _tmp_datasets.append(tmp_dataset)

        # Pair this model only with the datasets built for its tokenizer.
        model_dataset_combinations.append(dict(models=[model], datasets=_tmp_datasets))
        # `model` is appended once per context length, so `models` contains
        # each model config len(max_seq_lens) times.
        models.append(model)
        datasets.extend(_tmp_datasets)
|
|
|
|
|
|
|
|
|
|
|
|
# Inference stage: tasks are split by NumWorkerPartitioner and executed by a
# LocalRunner with up to 16 workers and `retry=5`.
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
        retry=5,
    ),
)

# Evaluation stage: NaivePartitioner with a LocalRunner of up to 32 workers.
# The name `eval` shadows the Python builtin; it is kept because it is the
# name this config exposes. Do not call the builtin `eval` below this line.
eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=OpenICLEvalTask),
    ),
)
|
|
|
|
|
|
|
|
|
|
|
|
# Summary table layout: one row per context-length suffix in `abbr_suffixs`.
# `summary_groups` is the concatenation of every list in scope whose name ends
# in `_summary_groups` (here: `ruler_summary_groups` from the base configs).
summarizer = dict(
    dataset_abbrs=abbr_suffixs,
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')],
        [],
    ),
)
|
|
|
|
|
|
|
|
|
|
|
|
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset    version    metric         mode      qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind    internlm2_5-7b-chat-1m-turbomind
# ---------  ---------  -------------  ------  -----------------------------  -------------------------------  ----------------------------------
# 4k         -          naive_average  gen                             93.66                            93.48                               91.20
# 8k         -          naive_average  gen                             88.38                            89.95                               89.07
# 16k        -          naive_average  gen                             84.27                             0.14                               87.61
# 32k        -          naive_average  gen                             81.36                             0.00                               84.59
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
|