diff --git a/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py b/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py
index e79a1f73..6dcdce83 100644
--- a/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py
+++ b/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py
@@ -5,11 +5,18 @@ models = [
         type=TurboMindModelwithChatTemplate,
         abbr='mixtral-large-instruct-2407-turbomind',
         path='mistralai/Mistral-Large-Instruct-2407',
-        engine_config=dict(session_len=32768, max_batch_size=16, tp=4),
-        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        engine_config=dict(
+            session_len=32768,
+            max_batch_size=16,
+            tp=4,
+            cache_max_entry_count=0.7,
+        ),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
+        ),
         max_seq_len=32768,
         max_out_len=4096,
-        batch_size=16,
+        batch_size=8,
         run_cfg=dict(num_gpus=4),
     )
 ]
diff --git a/configs/summarizers/needlebench.py b/configs/summarizers/needlebench.py
index d1aeb984..85d90dc0 100644
--- a/configs/summarizers/needlebench.py
+++ b/configs/summarizers/needlebench.py
@@ -138,6 +138,10 @@ needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_lis
 context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
 needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, '1000k')
 
+depths_list_internal = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, ]
+needlebench_internal_32k_summarizer = create_summarizer([32000], depths_list_internal, '32000')
+needlebench_internal_100k_summarizer = create_summarizer([100000], depths_list_internal, '100000')
+needlebench_internal_200k_summarizer = create_summarizer([200000], depths_list_internal, '200000')
 
 _needlebench_8k_parallel_en_batch1 = []
 _needlebench_8k_parallel_en_batch5 = []
diff --git a/opencompass/configs/datasets/needlebench/needlebench_base/needlebench_base_gen.py b/opencompass/configs/datasets/needlebench/needlebench_base/needlebench_base_gen.py
new file mode 100644
index 00000000..ba111e01
--- /dev/null
+++ b/opencompass/configs/datasets/needlebench/needlebench_base/needlebench_base_gen.py
@@ -0,0 +1,8 @@
+from mmengine.config import read_base
+
+with read_base():
+
+    from .needlebench_single import needlebench_en_datasets as needlebench_origin_en_datasets
+    from .needlebench_single import needlebench_zh_datasets as needlebench_origin_zh_datasets
+
+needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git a/opencompass/configs/datasets/needlebench/needlebench_base/needlebench_single.py b/opencompass/configs/datasets/needlebench/needlebench_base/needlebench_single.py
new file mode 100644
index 00000000..1e0d1934
--- /dev/null
+++ b/opencompass/configs/datasets/needlebench/needlebench_base/needlebench_single.py
@@ -0,0 +1,111 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
+from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
+from opencompass.datasets.needlebench.origin import needlebench_postprocess
+from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
+import math
+
+
+def logistic(x, L=100, x0=50, k=0.1):
+    return round(L / (1 + math.exp(-k * (x - x0))), 3)
+
+
+def generate_linear_space(start, end, num):
+    if num == 1:
+        return [start]
+    elif num < 1:
+        raise ValueError('num must be at least 1.')
+    step = (end - start) / (num - 1)
+    return [start + step * i for i in range(num)]
+
+
+def generate_depth_percents(intervals, interval_type):
+    if interval_type == 'linear':
+        return generate_linear_space(0, 100, intervals)
+    elif interval_type == 'sigmoid':
+        linear_space = generate_linear_space(0, 100, intervals)
+        return [logistic(x) for x in linear_space]
+    else:
+        raise ValueError('Unsupported interval type')
+
+
+needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
+
+needlebench_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{prompt}'),
+                # dict(role='BOT', prompt='{answer}\n'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+needlebench_eval_cfg = dict(
+    evaluator=dict(type=NeedleBenchOriginEvaluator),
+    pred_postprocessor=dict(type=needlebench_postprocess),
+    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
+    pred_role='BOT',
+)
+
+context_lengths = [32000, 100000, 200000, ]  # 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000
+depths_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, ]  #
+
+base_path = 'opencompass/needlebench'
+file_list = ['en_un_asr.jsonl']
+needlebench_en_datasets = []
+needle_file_name = 'needles.jsonl'
+
+for original_context_length in context_lengths:
+    for depth_percent in depths_list:
+        dataset_dict = {
+            'abbr': f'Length{original_context_length}'
+            f'Depth{int(depth_percent)}_origin_en_{original_context_length}',
+            'type': NeedleBenchOriginDataset,
+            'path': base_path,
+            'length': original_context_length,
+            'depth': int(depth_percent),
+            'tokenizer_model': 'gpt-4',
+            'file_list': file_list,
+            'num_repeats_per_file': 10,
+            'length_buffer': 600,
+            'guide': False,
+            'language': 'English',
+            'needle_file_name': needle_file_name,
+            'reader_cfg': needlebench_reader_cfg,
+            'infer_cfg': needlebench_infer_cfg,
+            'eval_cfg': needlebench_eval_cfg,
+        }
+        needlebench_en_datasets.append(dataset_dict)
+
+file_list = ['zh_all.jsonl']
+needlebench_zh_datasets = []
+needle_file_name = 'needles.jsonl'
+
+for original_context_length in context_lengths:
+    for depth_percent in depths_list:
+        dataset_dict = {
+            'abbr': f'Length{original_context_length}'
+            f'Depth{int(depth_percent)}_origin_zh_{original_context_length}',
+            'type': NeedleBenchOriginDataset,
+            'path': base_path,
+            'length': original_context_length,
+            'depth': int(depth_percent),
+            'tokenizer_model': 'gpt-4',
+            'file_list': file_list,
+            'num_repeats_per_file': 10,
+            'length_buffer': 200,
+            'guide': False,
+            'language': 'Chinese',
+            'needle_file_name': needle_file_name,
+            'reader_cfg': needlebench_reader_cfg,
+            'infer_cfg': needlebench_infer_cfg,
+            'eval_cfg': needlebench_eval_cfg,
+        }
+        needlebench_zh_datasets.append(dataset_dict)
diff --git a/opencompass/configs/models/mistral/hf_ministral_8b_instruct_2410.py b/opencompass/configs/models/mistral/hf_ministral_8b_instruct_2410.py
new file mode 100644
index 00000000..5a541a4d
--- /dev/null
+++ b/opencompass/configs/models/mistral/hf_ministral_8b_instruct_2410.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr="ministral-8B-instruct-2410-hf",
+        path="mistralai/Ministral-8B-Instruct-2410",
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=2),
+    )
+]
diff --git a/opencompass/configs/models/mistral/lmdeploy_ministral_8b_instruct_2410.py b/opencompass/configs/models/mistral/lmdeploy_ministral_8b_instruct_2410.py
new file mode 100644
index 00000000..3fadb881
--- /dev/null
+++ b/opencompass/configs/models/mistral/lmdeploy_ministral_8b_instruct_2410.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr="ministral-8B-instruct-2410-turbomind",
+        path="mistralai/Ministral-8B-Instruct-2410",
+        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        max_seq_len=16384,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
diff --git a/opencompass/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py b/opencompass/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py
index e79a1f73..6dcdce83 100644
--- a/opencompass/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py
+++ b/opencompass/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py
@@ -5,11 +5,18 @@ models = [
         type=TurboMindModelwithChatTemplate,
         abbr='mixtral-large-instruct-2407-turbomind',
         path='mistralai/Mistral-Large-Instruct-2407',
-        engine_config=dict(session_len=32768, max_batch_size=16, tp=4),
-        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        engine_config=dict(
+            session_len=32768,
+            max_batch_size=16,
+            tp=4,
+            cache_max_entry_count=0.7,
+        ),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
+        ),
         max_seq_len=32768,
         max_out_len=4096,
-        batch_size=16,
+        batch_size=8,
         run_cfg=dict(num_gpus=4),
     )
 ]
diff --git a/opencompass/configs/models/nvidia/lmdeploy_nemotron_70b_instruct_hf.py b/opencompass/configs/models/nvidia/lmdeploy_nemotron_70b_instruct_hf.py
new file mode 100644
index 00000000..2ed9f56e
--- /dev/null
+++ b/opencompass/configs/models/nvidia/lmdeploy_nemotron_70b_instruct_hf.py
@@ -0,0 +1,18 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='nvidia-3_1-Nemotron-70b-instruct-HF-turbomind',
+        path='nvidia/Llama-3.1-Nemotron-70B-Instruct-HF',
+        engine_config=dict(max_batch_size=16, tp=4),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
+        ),
+        max_seq_len=16384,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=4),
+        stop_words=['<|end_of_text|>', '<|eot_id|>'],
+    )
+]
diff --git a/opencompass/configs/summarizers/needlebench.py b/opencompass/configs/summarizers/needlebench.py
index d1aeb984..85d90dc0 100644
--- a/opencompass/configs/summarizers/needlebench.py
+++ b/opencompass/configs/summarizers/needlebench.py
@@ -138,6 +138,10 @@ needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_lis
 context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
 needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, '1000k')
 
+depths_list_internal = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, ]
+needlebench_internal_32k_summarizer = create_summarizer([32000], depths_list_internal, '32000')
+needlebench_internal_100k_summarizer = create_summarizer([100000], depths_list_internal, '100000')
+needlebench_internal_200k_summarizer = create_summarizer([200000], depths_list_internal, '200000')
 
 _needlebench_8k_parallel_en_batch1 = []
 _needlebench_8k_parallel_en_batch5 = []
diff --git a/opencompass/datasets/dingo.py b/opencompass/datasets/dingo.py
index 753d78dd..ea23b221 100644
--- a/opencompass/datasets/dingo.py
+++ b/opencompass/datasets/dingo.py
@@ -10,6 +10,7 @@ from datasets import Dataset
 
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_data_path
 
 from .base import BaseDataset
 
@@ -19,6 +20,7 @@ class DingoDataset(BaseDataset):
 
     @staticmethod
     def load(path: str):
+        path = get_data_path(path, local_mode=True)
         raw_data = []
         with open(path, encoding='utf-8') as f:
             reader = csv.reader(f, delimiter=';')
@@ -34,6 +36,7 @@ class DingoLongDataset(BaseDataset):
 
     @staticmethod
     def load(path: str):
+        path = get_data_path(path, local_mode=True)
         raw_data = []
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
@@ -46,7 +49,6 @@ class DingoEvaluator(BaseEvaluator):
 
     def score(self, origin_prompt: List, predictions: List) -> dict:
         try:
-            # from dingo.model.model import Model
             from dingo.exec import Executor
             from dingo.io import InputArgs
         except Exception:
@@ -58,27 +60,30 @@ class DingoEvaluator(BaseEvaluator):
         current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
         file_data = [{'prompt': pmt, 'prediction': prd}
                      for pmt, prd in zip(origin_prompt, predictions)]
-        file_name = 'dingo_file_' + current_time + '.jsonl'
+        os.makedirs('tmp', exist_ok=True)
+        file_name = os.path.join('tmp', 'dingo_file_' + current_time + '.jsonl')  # noqa: E501
+
         with open(file_name, 'a', encoding='utf-8') as f:
             for d in file_data:
                 json.dump(d, f, ensure_ascii=False)
                 f.write('\n')
-
         input_data = {
-            'eval_models': ['llm_base'],
+            'eval_model': 'llm_base',
             'input_path': file_name,
             'output_path': './outputs/dingo/',
+            'save_data': True,
             'dataset': 'local',
-            'datasource': 'local',
             'data_format': 'jsonl',
-            'column_prompt': ['prompt'],
-            'column_content': ['prediction'],
+            'column_prompt': 'prompt',
+            'column_content': 'prediction',
         }
-        # Model.apply_config(input_data["custom_config_path"])
-        input_args = InputArgs(**input_data)
-        executor = Executor.exec_map['local'](input_args)
-        result = executor.execute()
-        summary = result[0].to_dict()
-
-        os.remove(file_name)
+        try:
+            input_args = InputArgs(**input_data)
+            executor = Executor.exec_map['local'](input_args)
+            result = executor.execute()
+            summary = result[0].to_dict()
+        except Exception:
+            raise
+        finally:
+            os.remove(file_name)
         return summary
diff --git a/opencompass/datasets/needlebench/origin.py b/opencompass/datasets/needlebench/origin.py
index 3359eae7..f50be746 100644
--- a/opencompass/datasets/needlebench/origin.py
+++ b/opencompass/datasets/needlebench/origin.py
@@ -81,10 +81,30 @@ class NeedleBenchOriginDataset(BaseDataset):
             else:
                 raise ValueError(f"Language '{language}' is not supported.")
 
+        def _modify_retrieval_question_for_base(retrieval_question):
+            if language == 'Chinese':
+                parts = retrieval_question.split('请按照')
+                retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题'
+                                      '最相关的内容是什么。请按照' + parts[1])
+                return retrieval_question.replace("请按照'", '')[:-16]
+            elif language == 'English':
+                parts = retrieval_question.split('Please answer in the format')
+                retrieval_question = (
+                    parts[0] + 'Before answering, please consider'
+                    ' what in the document is most relevant to this question.'
+                    ' Please answer in the format' + parts[1])
+                return retrieval_question.replace(
+                    "Please answer in the format '", '')[:-10]
+            else:
+                raise ValueError(f"Language '{language}' is not supported.")
+
         def _generate_prompt(context, retrieval_question):
             if guide:
                 retrieval_question = _modify_retrieval_question(
                     retrieval_question)
+            else:
+                retrieval_question = _modify_retrieval_question_for_base(
+                    retrieval_question)
 
             if language == 'Chinese':
                 if position == 'End':
@@ -129,10 +149,10 @@
             return prompt
 
         file_names = [
-            'PaulGrahamEssays.jsonl', 'multi_needle_reasoning_en.json',
-            'multi_needle_reasoning_zh.json', 'zh_finance.jsonl',
-            'zh_game.jsonl', 'zh_general.jsonl', 'zh_government.jsonl',
-            'zh_movie.jsonl', 'zh_tech.jsonl'
+            'en_un_asr.jsonl', 'zh_all.jsonl', 'PaulGrahamEssays.jsonl',
+            'multi_needle_reasoning_en.json', 'multi_needle_reasoning_zh.json',
+            'zh_finance.jsonl', 'zh_game.jsonl', 'zh_general.jsonl',
+            'zh_government.jsonl', 'zh_movie.jsonl', 'zh_tech.jsonl'
         ]
         path = get_data_path(path)
         if os.environ.get('DATASET_SOURCE') == 'HF':
diff --git a/opencompass/models/huggingface_above_v4_33.py b/opencompass/models/huggingface_above_v4_33.py
index d1d2b3f3..261d3926 100644
--- a/opencompass/models/huggingface_above_v4_33.py
+++ b/opencompass/models/huggingface_above_v4_33.py
@@ -518,6 +518,7 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
                  max_seq_len: Optional[int] = None,
                  pad_token_id: Optional[int] = None,
                  stop_words: Optional[str] = [],
+                 drop_middle: bool = False,
                  **other_kwargs):
 
         self.logger = get_logger()
@@ -525,6 +526,7 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
         self.tokenizer_only = tokenizer_only
         self.template_parser = LMTemplateParser()
         self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
+        self.drop_middle = drop_middle
         self._load_tokenizer(tokenizer_path or path, tokenizer_kwargs, pad_token_id)
         if not tokenizer_only:
             self._load_model(path=path, kwargs=model_kwargs, peft_path=peft_path, peft_kwargs=peft_kwargs)
@@ -551,7 +553,17 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
             add_special_tokens=True,
             max_length=self.max_seq_len
         )
-        tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
+        if self.drop_middle:
+            assert len(inputs) == 1
+            input_ids = self.tokenizer(inputs, padding=False, truncation=False)['input_ids']
+            input_ids = torch.tensor(input_ids)
+            if input_ids.shape[-1] > self.max_seq_len:
+                input_ids = torch.cat([input_ids[:, : self.max_seq_len // 2], input_ids[:, - self.max_seq_len // 2:]], dim=-1)
+            tokens = {'input_ids': input_ids, }
+        else:
+            tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
         tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
 
         generation_kwargs = self.generation_kwargs.copy()
@@ -603,7 +615,17 @@ class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
             add_special_tokens=True,
             max_length=self.max_seq_len
         )
-        tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
+        if self.drop_middle:
+            assert len(inputs) == 1
+            input_ids = self.tokenizer(inputs, padding=False, truncation=False)['input_ids']
+            input_ids = torch.tensor(input_ids)
+            if input_ids.shape[-1] > self.max_seq_len:
+                input_ids = torch.cat([input_ids[:, : self.max_seq_len // 2], input_ids[:, - self.max_seq_len // 2:]], dim=-1)
+            tokens = {'input_ids': input_ids, }
+        else:
+            tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
+
         tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
 
         outputs = self.model(**tokens)[0]
diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py
index 8b9f3e62..31cdba1d 100644
--- a/opencompass/models/interntrain.py
+++ b/opencompass/models/interntrain.py
@@ -32,16 +32,22 @@ class InternTrainManager:
 class CurrentInternTrainManager(InternTrainManager):
 
     def load_config(self, path, model_config=None):
-        from internlm.config import Config
         if model_config is None:
-            model_config = torch.load(os.path.join(path, 'model_config.pt'))
-        elif isinstance(model_config, dict):
-            model_config = Config(model_config)
-        elif isinstance(model_config, str):
-            model_config = Config.fromfile(model_config).model
+            from internlm.checkpoint.checkpoint_manager import try_load_config
+            model_config = try_load_config(
+                os.path.join(path, 'model_config.pt'))
+        elif isinstance(model_config, str) and model_config.endswith('.pt'):
+            from internlm.checkpoint.checkpoint_manager import try_load_config
+            model_config = try_load_config(model_config)
         else:
-            raise NotImplementedError(
-                'model_config should be None, dict or filename.')
+            from internlm.config import Config
+            if isinstance(model_config, dict):
+                model_config = Config(model_config)
+            elif isinstance(model_config, str):
+                model_config = Config.fromfile(model_config).model
+            else:
+                raise NotImplementedError(
+                    'model_config should be None, dict or filename.')
 
         return model_config
 
@@ -60,6 +66,8 @@ class LegacyInternTrainManager(InternTrainManager):
         from internlm.core.context import Config
         if model_config is None:
             model_config = torch.load(os.path.join(path, 'model_config.pt'))
+        elif isinstance(model_config, str) and model_config.endswith('.pt'):
+            model_config = torch.load(model_config)
         elif isinstance(model_config, dict):
             model_config = Config(model_config)
         elif isinstance(model_config, str):
@@ -132,6 +140,7 @@ class InternTrain(BaseModel):
                 tokenizer_path: Optional[str] = None,
                 tokenizer_type: str = 'INTERNLM',
                 model_config: Optional[Union[str, Dict]] = None,
+                parallel_config: Optional[str] = None,
                 model_type: str = 'INTERNLM2',
                 ckpt_type: Optional[str] = None,
                 meta_template: Optional[Dict] = None,
@@ -140,11 +149,13 @@
                 sync_rank: bool = False,
                 mode='none',
                 end_str: Optional[str] = None):
+
        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         tokenizer_only=tokenizer_only,
                         meta_template=meta_template,
                         sync_rank=sync_rank)
+
        self.logger = get_logger()
        # insert interntrain module
        self.manager = InternTrainManager.build(module_path)
@@ -162,6 +173,7 @@
         if not tokenizer_only:
             self._load_model(path=path,
                              model_config=model_config,
+                             parallel_config=parallel_config,
                              model_type=model_type,
                              model_dtype=model_dtype,
                              ckpt_type=ckpt_type)
@@ -196,6 +208,7 @@
     def _load_model(self,
                     path: str,
                     model_config: Optional[str] = None,
+                    parallel_config: Optional[str] = None,
                     model_type: str = 'INTERNLM2',
                     model_dtype: Optional[str] = None,
                     ckpt_type: Optional[str] = None):
@@ -216,10 +229,11 @@
         world_size = int(os.getenv('WORLD_SIZE', '1'))
         tp_size = world_size  # TODO
         self.logger.info(f'world size: {world_size} tp: {tp_size}')
-        parallel_config = dict(zero1=dict(size=1, fsdp=False),
-                               pipeline=dict(size=1),
-                               tensor=dict(size=tp_size, mode='mtp'),
-                               sequence_parallel=False)
+        if parallel_config is None:
+            parallel_config = dict(zero1=dict(size=1, fsdp=False),
+                                   pipeline=dict(size=1),
+                                   tensor=dict(size=tp_size, mode='mtp'),
+                                   sequence_parallel=False)
         config = dict(model=model_config,
                       parallel=parallel_config,
                       data=dict(use_packed_dataset=False),
@@ -253,7 +267,10 @@
             load_func = LOAD_FUNC_DICT[ckpt_type]
             load_func(path, self.model)
 
-        self.model.to(model_config['dtype']).eval().cuda()
+        if 'moe' in model_type.lower():
+            self.model.eval().cuda()
+        else:
+            self.model.to(model_config['dtype']).eval().cuda()
 
     def _load_tokenizer(self, tokenizer_path: str, tokenizer_type: str):
         from internlm.core.context.registry import TOKENIZER_INITIALIZER
diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py
index cf6a5d99..92cd0950 100644
--- a/opencompass/models/turbomind.py
+++ b/opencompass/models/turbomind.py
@@ -53,11 +53,13 @@ class TurboMindModel(BaseModel):
                  engine_config: Dict = {},
                  gen_config: Dict = {},
                  batch_padding: bool = False,
+                 drop_middle: bool = False,
                  end_str: Optional[str] = None):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
         self.logger = get_logger()
+        self.drop_middle = drop_middle
         self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
         from lmdeploy import version_info
         from transformers import AutoTokenizer
@@ -118,6 +120,21 @@
         }
         gen_config = GenerationConfig(**gen_config)
 
+        if self.drop_middle:
+            inputs_drop_middle = []
+            for input in inputs:
+                input_ids = self.tokenizer([input],
+                                           padding=False,
+                                           truncation=False)['input_ids'][0]
+                if len(input_ids) > self.max_seq_len:
+                    input_ids = input_ids[:self.max_seq_len //
+                                          2] + input_ids[-self.max_seq_len //
+                                                         2:]
+                    input = self.tokenizer.decode(input_ids,
+                                                  skip_special_tokens=True)
+                inputs_drop_middle.append(input)
+            inputs = inputs_drop_middle
+
         results = []
         outputs = self.pipe(inputs, gen_config=gen_config, do_preprocess=False)
         for output in outputs:
diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py
index 094c4269..45f7ec82 100644
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -167,6 +167,10 @@
 
         # set priority to 1 as default
         task_priority = self.aliyun_cfg.get('priority', 1)
+        worker_cpu = self.aliyun_cfg.get('worker_cpu', 12)
+        worker_memory = self.aliyun_cfg.get('worker_memory', 192)
+        config_path = (f" --config {self.aliyun_cfg['dlc_config_path']}"
+                       if 'dlc_config_path' in self.aliyun_cfg else '')
 
         # Different dlc versions has different commands
         if self.aliyun_cfg.get('dlc_job_cmd') == 'create':
@@ -179,14 +183,14 @@
             f'dlc {dlc_job_cmd}'
             f" --command '{shell_cmd}'"
             f' --name {task_name[:512]}'
-            f" --config {self.aliyun_cfg['dlc_config_path']}"
+            f'{config_path}'
             f" --workspace_id {self.aliyun_cfg['workspace_id']}"
             f" --resource_id={self.aliyun_cfg['resource_id']}"
             f' --priority {task_priority}'
             f'{worker_cmd}'
-            f' --worker_cpu {max(num_gpus * 8, 12)}'
+            f' --worker_cpu {max(num_gpus * 8, worker_cpu)}'
             f' --worker_gpu {num_gpus}'
-            f' --worker_memory {max(num_gpus * 128, 192)}Gi'
+            f' --worker_memory {max(num_gpus * 128, worker_memory)}Gi'
             f" --worker_image {self.aliyun_cfg['worker_image']}"
             f" --data_sources={','.join(self.aliyun_cfg['data_sources'])}")
         get_cmd = partial(task.get_command,
@@ -253,8 +257,15 @@
             for retry_index in range(num_retry):
                 time.sleep(2)
                 try:
-                    job_info = json.loads(
-                        subprocess.getoutput(f'dlc get job {job_id}'))
+                    raw_job_info = subprocess.getoutput(
+                        f'dlc get job {job_id}{config_path}')
+                    if raw_job_info.startswith(
+                            '/bin/bash') or raw_job_info.startswith(
+                                '[OK]') or raw_job_info.startswith(
+                                    '[FAILED]'):
+                        raw_job_info = raw_job_info[raw_job_info.
+                                                    index('\n') + 1:]
+                    job_info = json.loads(raw_job_info)
                     break
                 except:  # noqa: E722
                     if retry_index > num_retry // 3:
@@ -287,7 +298,7 @@
                         elasped_time).strftime('%Y-%m-%dT%H:%M:%SZ')
                     logs_cmd = ('dlc logs'
                                 f' {job_id} {job_id}-master-0'
-                                f" -c {self.aliyun_cfg['dlc_config_path']}"
+                                f'{config_path}'
                                 f' --start_time {pri_time}'
                                 f' --end_time {cur_time}')
                     try:
diff --git a/opencompass/summarizers/summarizer_pretrain.py b/opencompass/summarizers/summarizer_pretrain.py
index ac5d2f44..6c19ae72 100644
--- a/opencompass/summarizers/summarizer_pretrain.py
+++ b/opencompass/summarizers/summarizer_pretrain.py
@@ -15,7 +15,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                                model_abbr_from_cfg)
 from opencompass.utils.prompt import get_prompt_hash
 
-METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
+METRIC_WHITELIST = ['pass@1', 'score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
 METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
 
 class PretrainSummarizer:
@@ -256,14 +256,13 @@
                 f.write('\n'.join([','.join(row) for row in table]) + '\n')
             self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
 
-
         summary_groups = summarizer_cfg.get('summary_groups', [])
         for sg in summary_groups:
             for model_abbr in model_abbrs:
                 results = {}
                 eval_modes = []
                 for dataset_abbr in sg['subsets']:
-                    if dataset_abbr in parsed_results[model_abbr]:
+                    if dataset_abbr in parsed_results[model_abbr] and len(parsed_results[model_abbr][dataset_abbr]) > 1:
                         results[dataset_abbr] = (parsed_results[model_abbr][dataset_abbr][-1],parsed_results[model_abbr][dataset_abbr][-2])
                         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
 
@@ -327,8 +326,9 @@
             for model_abbr in model_abbrs:
                 if dataset_abbr in parsed_results[model_abbr]:
                     if incorrect_bpb != -1 and correct_bpb != -1:
-                        row.append('{:.02f}/{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][correct_bpb],
-                                                            parsed_results[model_abbr][dataset_abbr][incorrect_bpb]))
+                        right_bpb = parsed_results[model_abbr][dataset_abbr][correct_bpb]
+                        wrong_bpb = parsed_results[model_abbr][dataset_abbr][incorrect_bpb]
+                        row.append('{:.02f}/{:.02f}/{:.02f}'.format(right_bpb,wrong_bpb,wrong_bpb-right_bpb))
                     else:
                         row.append('{:.02f}'.format(-1))
                 else:
diff --git a/requirements/extra.txt b/requirements/extra.txt
index efeef772..96789956 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -1,7 +1,8 @@
 # Alpaca-eval
 alpaca-eval==0.6
 cn2an
-dingo-python
+# Dingo
+dingo-python==1.1.2
 # Icl topk retriever
 faiss_gpu==1.7.2
 # Humaneval, Humaneval X