diff --git a/examples/eval_ruler.py b/examples/eval_ruler.py
index b15174d5..f08f854c 100644
--- a/examples/eval_ruler.py
+++ b/examples/eval_ruler.py
@@ -5,24 +5,21 @@ from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 
 with read_base():
-    from opencompass.configs.datasets.ruler.ruler_cwe_gen import \
-        cwe_datasets  # CWE
-    from opencompass.configs.datasets.ruler.ruler_fwe_gen import \
-        fwe_datasets  # FWE
-    from opencompass.configs.datasets.ruler.ruler_niah_gen import \
-        niah_datasets  # Niah
-    from opencompass.configs.datasets.ruler.ruler_qa_gen import \
-        qa_datasets  # QA
-    from opencompass.configs.datasets.ruler.ruler_vt_gen import \
-        vt_datasets  # VT
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
-        models as internlm2_5_7b_chat_1m
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
-        models as llama3_8b_instruct_model
-    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
-        models as qwen2_7b_instruct_model
-    from opencompass.configs.summarizers.groups.ruler import \
-        ruler_summary_groups
+    from opencompass.configs.datasets.ruler.ruler_cwe_gen import cwe_datasets  # CWE
+    from opencompass.configs.datasets.ruler.ruler_fwe_gen import fwe_datasets  # FWE
+    from opencompass.configs.datasets.ruler.ruler_niah_gen import niah_datasets  # Niah
+    from opencompass.configs.datasets.ruler.ruler_qa_gen import qa_datasets  # QA
+    from opencompass.configs.datasets.ruler.ruler_vt_gen import vt_datasets  # VT
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import (
+        models as internlm2_5_7b_chat_1m,
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
+        models as llama3_8b_instruct_model,
+    )
+    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
+        models as qwen2_7b_instruct_model,
+    )
+    from opencompass.configs.summarizers.groups.ruler import ruler_summary_groups
 
 import_datasets = sum(
     [niah_datasets, vt_datasets, fwe_datasets, cwe_datasets, qa_datasets], [])
diff --git a/opencompass/configs/datasets/ruler/ruler_256k_gen.py b/opencompass/configs/datasets/ruler/ruler_256k_gen.py
new file mode 100644
index 00000000..09b3f7ae
--- /dev/null
+++ b/opencompass/configs/datasets/ruler/ruler_256k_gen.py
@@ -0,0 +1,32 @@
+import os
+
+from mmengine.config import read_base
+
+with read_base():
+    from .ruler_cwe_gen import cwe_datasets as cwe  # CWE
+    from .ruler_fwe_gen import fwe_datasets as fwe  # FWE
+    from .ruler_niah_gen import niah_datasets as niah  # Niah
+    from .ruler_qa_gen import qa_datasets as qa  # QA
+    from .ruler_vt_gen import vt_datasets as vt  # VT
+
+
+import_ds = sum((cwe, fwe, niah, qa, vt), [])
+
+# Evaluation config
+NUM_SAMPLES = 100  # Change to the number of samples you need
+tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
+# Change the context lengths to be tested
+max_seq_lens = [1024 * 256]
+abbr_suffixs = ['256k']
+
+ruler_datasets = []
+
+# Different seq length
+for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
+    for dataset in import_ds:
+        tmp_dataset = dataset.deepcopy()
+        tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
+        tmp_dataset['num_samples'] = NUM_SAMPLES
+        tmp_dataset['max_seq_length'] = max_seq_len
+        tmp_dataset['tokenizer_model'] = tokenizer_model
+        ruler_datasets.append(tmp_dataset)
diff --git a/opencompass/configs/datasets/ruler/ruler_512k_gen.py b/opencompass/configs/datasets/ruler/ruler_512k_gen.py
new file mode 100644
index 00000000..3cc2c111
--- /dev/null
+++ b/opencompass/configs/datasets/ruler/ruler_512k_gen.py
@@ -0,0 +1,32 @@
+import os
+
+from mmengine.config import read_base
+
+with read_base():
+    from .ruler_cwe_gen import cwe_datasets as cwe  # CWE
+    from .ruler_fwe_gen import fwe_datasets as fwe  # FWE
+    from .ruler_niah_gen import niah_datasets as niah  # Niah
+    from .ruler_qa_gen import qa_datasets as qa  # QA
+    from .ruler_vt_gen import vt_datasets as vt  # VT
+
+
+import_ds = sum((cwe, fwe, niah, qa, vt), [])
+
+# Evaluation config
+NUM_SAMPLES = 100  # Change to the number of samples you need
+tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
+# Change the context lengths to be tested
+max_seq_lens = [1024 * 512]
+abbr_suffixs = ['512k']
+
+ruler_datasets = []
+
+# Different seq length
+for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
+    for dataset in import_ds:
+        tmp_dataset = dataset.deepcopy()
+        tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
+        tmp_dataset['num_samples'] = NUM_SAMPLES
+        tmp_dataset['max_seq_length'] = max_seq_len
+        tmp_dataset['tokenizer_model'] = tokenizer_model
+        ruler_datasets.append(tmp_dataset)
diff --git a/opencompass/configs/datasets/ruler/ruler_combined_gen.py b/opencompass/configs/datasets/ruler/ruler_combined_gen.py
index 077c4f10..77620824 100644
--- a/opencompass/configs/datasets/ruler/ruler_combined_gen.py
+++ b/opencompass/configs/datasets/ruler/ruler_combined_gen.py
@@ -8,5 +8,7 @@ with read_base():
     from .ruler_32k_gen import ruler_datasets as ruler_32k_ds
     from .ruler_64k_gen import ruler_datasets as ruler_64k_ds
     from .ruler_128k_gen import ruler_datasets as ruler_128k_ds
+    from .ruler_256k_gen import ruler_datasets as ruler_256k_ds
+    from .ruler_512k_gen import ruler_datasets as ruler_512k_ds
 
 ruler_combined_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), [])
diff --git a/opencompass/configs/datasets/ruler/ruler_vt_gen.py b/opencompass/configs/datasets/ruler/ruler_vt_gen.py
index 42dadc43..b1e5b7fd 100644
--- a/opencompass/configs/datasets/ruler/ruler_vt_gen.py
+++ b/opencompass/configs/datasets/ruler/ruler_vt_gen.py
@@ -1,8 +1,7 @@
+from opencompass.datasets.ruler.ruler_vt import RulerVtDataset, RulerVtEvaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets.ruler.ruler_vt import RulerVtDataset
-from opencompass.datasets.ruler.ruler_vt import RulerVtEvaluator
 
 # VT Dataset
 vt_datasets = [
diff --git a/opencompass/configs/summarizers/groups/ruler.py b/opencompass/configs/summarizers/groups/ruler.py
index 49a76567..dc383852 100644
--- a/opencompass/configs/summarizers/groups/ruler.py
+++ b/opencompass/configs/summarizers/groups/ruler.py
@@ -1,19 +1,21 @@
+"""RULER summary groups"""
+
 default_ruler_tasks = [
-    'ruler_niah_single_1',
-    'ruler_niah_single_2',
-    'ruler_niah_single_3',
-    'ruler_niah_multikey_1',
-    'ruler_niah_multikey_2',
-    'ruler_niah_multikey_3',
-    'ruler_niah_multivalue',
-    'ruler_niah_multiquery',
-    'ruler_vt',
-    'ruler_fwe',
-    'ruler_cwe',
-    'ruler_qa_squad',
-    'ruler_qa_hotpotqa',
+    "ruler_niah_single_1",
+    "ruler_niah_single_2",
+    "ruler_niah_single_3",
+    "ruler_niah_multikey_1",
+    "ruler_niah_multikey_2",
+    "ruler_niah_multikey_3",
+    "ruler_niah_multivalue",
+    "ruler_niah_multiquery",
+    "ruler_vt",
+    "ruler_fwe",
+    "ruler_cwe",
+    "ruler_qa_squad",
+    "ruler_qa_hotpotqa",
 ]
-context_window_sizes = ['4k', '8k', '16k', '32k', '64k', '128k', '1m']
+context_window_sizes = ["4k", "8k", "16k", "32k", "64k", "128k", "256k", "512k", "1m"]
 
 ruler_summary_groups = []
 for context_window_size in context_window_sizes:
diff --git a/opencompass/configs/summarizers/ruler.py b/opencompass/configs/summarizers/ruler.py
index cb35ac2e..5f9aae56 100644
--- a/opencompass/configs/summarizers/ruler.py
+++ b/opencompass/configs/summarizers/ruler.py
@@ -47,6 +47,18 @@ ruler_128k_summarizer = dict(
         [v for k, v in locals().items() if k.endswith('_summary_groups')], []
     ),
 )
+ruler_256k_summarizer = dict(
+    dataset_abbrs=["ruler_256k"],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], []
+    ),
+)
+ruler_512k_summarizer = dict(
+    dataset_abbrs=["ruler_512k"],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], []
+    ),
+)
 
 ruler_1m_summarizer = dict(
     dataset_abbrs=['ruler_1m'],
@@ -57,15 +69,17 @@ ruler_1m_summarizer = dict(
 
 ruler_combined_summarizer = dict(
     dataset_abbrs=[
-        'ruler_4k',
-        'ruler_8k',
-        'ruler_16k',
-        'ruler_32k',
-        'ruler_64k',
-        'ruler_128k',
-        'ruler_1m',
+        "ruler_4k",
+        "ruler_8k",
+        "ruler_16k",
+        "ruler_32k",
+        "ruler_64k",
+        "ruler_128k",
+        "ruler_256k",
+        "ruler_512k",
+        "ruler_1m",
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], []
    ),
 )