From a77b8a5cecb1d8dd31a500dec1ad34c6dd6ded9c Mon Sep 17 00:00:00 2001
From: Fengzhe Zhou
Date: Thu, 30 May 2024 00:21:58 +0800
Subject: [PATCH] [Sync] format (#1214)

---
 .../compassbench/compassbench_compare.py      |  58 +++++
 configs/eval_subjective_compassbench.py       | 137 ++++++++++
 configs/summarizers/groups/charm_reason.py    |   8 +-
 opencompass/datasets/subjective/__init__.py   |   1 +
 .../datasets/subjective/compassbench.py       | 101 ++++++++
 .../summarizers/subjective/__init__.py        |   1 +
 .../summarizers/subjective/compassbench.py    | 241 ++++++++++++++++++
 opencompass/utils/prompt.py                   |  11 +-
 opencompass/utils/run.py                      |  12 +-
 9 files changed, 561 insertions(+), 9 deletions(-)
 create mode 100644 configs/datasets/subjective/compassbench/compassbench_compare.py
 create mode 100644 configs/eval_subjective_compassbench.py
 create mode 100644 opencompass/datasets/subjective/compassbench.py
 create mode 100644 opencompass/summarizers/subjective/compassbench.py

diff --git a/configs/datasets/subjective/compassbench/compassbench_compare.py b/configs/datasets/subjective/compassbench/compassbench_compare.py
new file mode 100644
index 00000000..942eca7c
--- /dev/null
+++ b/configs/datasets/subjective/compassbench/compassbench_compare.py
@@ -0,0 +1,58 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import CompassBenchDataset
+
+subjective_reader_cfg = dict(
+    input_columns=['question', 'judge_prompt'],
+    output_column='judge',
+    )
+
+data_path = 'data/subjective/compassbench'
+
+subjective_datasets = []
+
+versions = ['CompassbenchV1']
+
+for version_abbr in versions:
+    subjective_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{question}'
+                ),
+            ]),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+    )
+
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt='{judge_prompt}'
+                    ),
+                ]),
+            ),
+        ),
+        pred_role='BOT',
+    )
+
+    subjective_datasets.append(
+        dict(
+            abbr=version_abbr,
+            type=CompassBenchDataset,
+            path=data_path,
+            name=version_abbr,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
+        ))
diff --git a/configs/eval_subjective_compassbench.py b/configs/eval_subjective_compassbench.py
new file mode 100644
index 00000000..ebb1c4e5
--- /dev/null
+++ b/configs/eval_subjective_compassbench.py
@@ -0,0 +1,137 @@
+from os import getenv as gv
+from opencompass.models import HuggingFaceCausalLM
+from mmengine.config import read_base
+
+with read_base():
+    from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets
+
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.summarizers import CompassBenchSummarizer
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+# -------------Inference Stage ----------------------------------------
+
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='internlm2-chat-7b-hf',
+        path='internlm/internlm2-chat-7b',
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=1),
+        stop_words=['</s>', '<|im_end|>'],
+        generation_kwargs=dict(
+            do_sample=True,
+        ),
+    )
+]
+
+datasets = [*subjective_datasets]
+
+infer = dict(
+    partitioner=dict(type=NaivePartitioner),
+    runner=dict(
+        type=SlurmSequentialRunner,
+        partition='llmeval',
+        quotatype='reserved',
+        max_num_workers=256,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+gpt4 = dict(
+    abbr='gpt4-turbo',
+    type=OpenAI,
+    path='gpt-4-1106-preview',
+    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    meta_template=api_meta_template,
+    query_per_second=1,
+    max_out_len=2048,
+    max_seq_len=4096,
+    batch_size=4,
+    retry=20,
+    temperature=1,
+)  # Re-run inference for gpt4's predictions, or use the pre-committed gpt4 predictions
+
+# -------------Evaluation Stage ----------------------------------------
+
+## ------------- JudgeLLM Configuration
+judge_models = [dict(
+    abbr='GPT4-Turbo',
+    type=OpenAI,
+    path='gpt-4-1106-preview',
+    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    meta_template=api_meta_template,
+    query_per_second=1,
+    max_out_len=1024,
+    max_seq_len=4096,
+    batch_size=2,
+    retry=20,
+    temperature=0,
+)]
+
+judge_models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='internlm102b',
+        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=4),
+        stop_words=['</s>', '<|im_end|>'],
+    ),
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='internlm102b2',
+        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=4),
+        stop_words=['</s>', '<|im_end|>'],
+    ),
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='internlm102b3',
+        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=4),
+        stop_words=['</s>', '<|im_end|>'],
+    )
+]
+
+## ------------- Evaluation Configuration
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveSizePartitioner,
+        strategy='split',
+        max_task_size=10000000,
+        mode='m2n',
+        infer_order='double',
+        base_models=[gpt4],
+        compare_models=models,
+        judge_models=judge_models,
+    ),
+    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
+    # given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
+)
+
+work_dir = 'outputs/compassbench/'
+
+summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')
diff --git a/configs/summarizers/groups/charm_reason.py b/configs/summarizers/groups/charm_reason.py
index 52edc8d2..3d1f4c19 100644
--- a/configs/summarizers/groups/charm_reason.py
+++ b/configs/summarizers/groups/charm_reason.py
@@ -20,16 +20,16 @@ prompts = [
 ]
 
 
-charm_reaso_summary_groups = []
+charm_reason_summary_groups = []
 for prompt in prompts:
     for region in regions:
         subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
-        charm_reaso_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
+        charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
 
 for prompt in prompts:
     subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
-    charm_reaso_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
+    charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
 
-charm_reaso_summary_groups.append(
+charm_reason_summary_groups.append(
     {'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
 )
diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py
index 1c4de400..439d6b2b 100644
--- a/opencompass/datasets/subjective/__init__.py
+++ b/opencompass/datasets/subjective/__init__.py
@@ -1,6 +1,7 @@
 from .alignbench import AlignmentBenchDataset  # noqa: F401, F403
 from .arena_hard import ArenaHardDataset  # noqa: F401, F403
 from .compass_arena import CompassArenaDataset  # noqa: F401, F403
+from .compassbench import CompassBenchDataset  # noqa: F401, F403
 from .corev2 import Corev2Dataset  # noqa: F401, F403
 from .creationbench import CreationBenchDataset  # noqa: F401, F403
 from .information_retrival import IRDataset  # noqa: F401, F403
diff --git a/opencompass/datasets/subjective/compassbench.py b/opencompass/datasets/subjective/compassbench.py
new file mode 100644
index 00000000..bd6f6a1d
--- /dev/null
+++ b/opencompass/datasets/subjective/compassbench.py
@@ -0,0 +1,101 @@
+# flake8: noqa
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。
+[用户问题]
+{question}
+
+[回答1开始]
+{prediction}
+[回答1结束]
+
+[回答2开始]
+{prediction2}
+[回答2结束]
+
+根据评分要求,请先对两个回答进行评价,最后在以下 3 个选项中做出选择:
+A. 回答1更好
+B. 回答2更好
+C. 回答1、2平局
+
+如果你认为回答1更好,你的输出应形如:
+评价1:回答1 xxx
+评价2:回答2 xxx
+选择:[[A]]
+
+如果你认为回答2更好,你的输出应形如:
+评价1:回答1 xxx
+评价2:回答2 xxx
+选择:[[B]]
+
+如果你认为回答1、2打成平手,你的输出应形如:
+评价1:回答1 xxx
+评价2:回答2 xxx
+选择:[[C]]
+"""
+
+base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options:
+A. Response 1 is better
+B. Response 2 is better
+C. Both responses are equal
+
+[user's question]
+{question}
+
+[Response 1 Start]
+{prediction}
+[Response 1 End]
+
+[Response 2 Start]
+{prediction2}
+[Response 2 End]
+
+If you believe that Response 1 is better, your output should be formatted as follows:
+Evaluation 1: Response 1 xxx
+Evaluation 2: Response 2 xxx
+Choice: [[A]]
+
+If you believe that Response 2 is better, your output should be formatted as follows:
+Evaluation 1: Response 1 xxx
+Evaluation 2: Response 2 xxx
+Choice: [[B]]
+
+If you believe that both responses are equally good, your output should be formatted as follows:
+Evaluation 1: Response 1 xxx
+Evaluation 2: Response 2 xxx
+Choice: [[C]]
+"""
+
+
+@LOAD_DATASET.register_module()
+class CompassBenchDataset(BaseDataset):
+
+    def load(self, path: str, name: str):
+        filename = osp.join(path, f'{name}.json')
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as f:
+            json_data = json.load(f)
+            for problem in json_data:
+                question = problem['question']
+                lan = problem['language']
+                others = problem['others']
+                judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en
+                raw_data.append({
+                    'question': question,
+                    'judge_prompt': judge_prompt,
+                    'judge': {
+                        'lan': lan,
+                        'level': others['level'],
+                        'category': problem['category'],
+                        'question': question
+                    }
+                })
+        dataset = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py
index 54ed56ad..88de42f5 100644
--- a/opencompass/summarizers/subjective/__init__.py
+++ b/opencompass/summarizers/subjective/__init__.py
@@ -4,6 +4,7 @@ from .all_obj import AllObjSummarizer
 from .alpacaeval import AlpacaSummarizer
 from .arenahard import ArenaHardSummarizer
 from .compass_arena import CompassArenaSummarizer
+from .compassbench import CompassBenchSummarizer
 from .corev2 import Corev2Summarizer
 from .creationbench import CreationBenchSummarizer
 from .flames import FlamesSummarizer
diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py
new file mode 100644
index 00000000..7a34c54a
--- /dev/null
+++ b/opencompass/summarizers/subjective/compassbench.py
@@ -0,0 +1,241 @@
+# flake8: noqa
+# yapf: disable
+import os
+import os.path as osp
+import re
+from collections import defaultdict
+from datetime import datetime
+from itertools import product
+
+import mmengine
+from mmengine import ConfigDict
+from tabulate import tabulate
+
+from opencompass.partitioners.sub_naive import remove_duplicate_pairs
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
+
+from .utils import get_judgeanswer_and_reference, get_outdir
+
+
+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
+
+def post_process_compass_arena(s):
+    if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', s):
+        return result[0]
+    else:
+        return None
+
+
+def check_position_bias(judged_answers, references, banned_choice=['C']):
+    """Check position bias in the judge LLM's judgements.
+
+    Args:
+        judged_answers: The successfully extracted judgements.
+        references: The references contain the original question, which is
+            used to locate the same question for the two judgements with
+            different answer positions.
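+
+    Returns:
+        int: The number of paired judgements that keep the same non-tie
+            choice after the answer order is swapped, i.e. suspected
+            position bias.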
+    """
+    position_bias_flag = 0
+    position_bias_dict = {}
+    for judge, ref in zip(judged_answers, references):
+        question = ref['question']
+        question_hash = hash(question)
+        if question_hash not in position_bias_dict:
+            position_bias_dict[question_hash] = {
+                'question': question,
+                'judge': judge
+            }
+        else:
+            first_judge = position_bias_dict[question_hash]['judge']
+            if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
+                # If the second choice is the same as the first (and neither is a tie), there is position bias.
+                position_bias_flag += 1
+    return position_bias_flag
+
+
+class CompassBenchSummarizer:
+    """Do the subjectivity analysis based on evaluation results.
+
+    Args:
+        config (ConfigDict): The configuration object of the evaluation task.
+            It's expected to be filled out at runtime.
+    """
+
+    def __init__(self,
+                 config: ConfigDict,
+                 judge_type='general',
+                 check_pos_bias=True,
+                 summary_type='single') -> None:
+        self.tasks = []
+        self.cfg = config
+        self.base_models = self.cfg['eval']['partitioner']['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.judge_models = self.cfg.get('judge_models', None)
+        self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
+        self.judge_type = judge_type
+        assert self.judge_type in ['general']
+        self.judge_map = {'general': post_process_compass_arena}
+        self.judge_function = self.judge_map[self.judge_type]
+        self.check_pos_bias = check_pos_bias
+        self.summary_type = summary_type
+
+    def get_score(self, time_str):
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        model_combinations = list(product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
+
+        if self.meta_judge_model is not None:
+            self.judge_models.append(self.meta_judge_model)
+
+        scores = {}
+
+        for idx, judge_model_cfg in enumerate(self.judge_models):
+            judge_model = model_abbr_from_cfg(judge_model_cfg)
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                for model_pair in unique_combinations:
+                    model1 = model_pair[0]['abbr']
+                    model2 = model_pair[1]['abbr']
+                    if idx == len(self.judge_models):
+                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
+                    else:
+                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+                    subdir_path = os.path.join(results_folder, subdir)
+                    if not os.path.isdir(subdir_path):
+                        print(subdir_path + ' does not exist! Please check!')
+                        continue
+                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
+                    if self.check_pos_bias:
+                        bias_num = check_position_bias(judged_answers, references)
+                    else:
+                        bias_num = 0
+                    win_model1 = defaultdict(float)
+                    win_model2 = defaultdict(float)
+                    categories = defaultdict(float)
+                    difficulties = defaultdict(float)
+                    model1 = references[0]['answer1']
+                    model2 = references[0]['answer2']
+                    for prediction, reference in zip(judged_answers, references):
+                        categories[dataset_abbr] += 1
+                        categories[reference['category']] += 1
+                        difficulties[reference['level']] += 1
+
+                        if prediction == 'A':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 1, 0
+                            else:
+                                score_1, score_2 = 0, 1
+                        elif prediction == 'B':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 0, 1
+                            else:
+                                score_1, score_2 = 1, 0
+                        elif prediction == 'C':
+                            if self.summary_type == 'half_add':
+                                score_1, score_2 = 0.5, 0.5
+                            else:
+                                score_1, score_2 = 0, 0
+
+                        win_model1[reference['category']] += score_1
+                        win_model1[dataset_abbr] += score_1
+                        win_model2[reference['category']] += score_2
+                        win_model2[dataset_abbr] += score_2
+                    for category in categories:
+                        win_model1[category] = win_model1[category] / categories[category] * 100
+                        win_model1[category] = round(win_model1[category], 2)
+                        win_model2[category] = win_model2[category] / categories[category] * 100
+                        win_model2[category] = round(win_model2[category], 2)
+                    win_model1['position_bias'] = bias_num
+                    win_model2['position_bias'] = bias_num
+
+                    if judge_model not in scores:
+                        scores[judge_model] = {}
+                    if dataset_abbr not in scores[judge_model]:
+                        scores[judge_model][dataset_abbr] = {}
+                    scores[judge_model][dataset_abbr][model2] = win_model2
+
+        return scores
+
+    def summarize(
+            self,
+            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
+    ):
+        """Summarize the subjectivity analysis based on evaluation results.
+
+        Args:
+            time_str (str): Timestamp for file naming.
+
+        Returns:
+            None: The summary tables are printed and written to CSV report
+                files.
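+            Report files are written as
+            'judged-by--<judge_abbr>-<dataset_abbr>-report.csv' and
+            'judged-by--<judge_abbr>-overall-report.csv' under the output
+            directory.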
+ """ + + + scores = self.get_score(time_str) + # scores['win_' + model1] = win_model1 + output_dir, results_folder = get_outdir(self.cfg, time_str) + + + for idx, judge_model in enumerate(self.judge_models): + judge_abbr = model_abbr_from_cfg(judge_model) + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] + one_column = list(scores[judge_abbr][dataset_abbr].values())[0] + row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']] + row_headers = [dataset_abbr, 'position_bias'] + row_headers + headers = [''] + summarizer_model_abbrs + table = [] + for row_header in row_headers: + row = [row_header] + for model_cfg in self.compare_models: + model_abbr = model_abbr_from_cfg(model_cfg) + s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '') + if isinstance(s, float): + s = f'{s:.2f}' + if isinstance(s, int): + s = str(s) + row.append(s) + table.append(row) + txt = tabulate(table, headers=headers) + print(txt) + + if idx == len(self.judge_models): + output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') + else: + output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') + + with open(output_filename, 'w') as f: + f.write(','.join(headers) + '\n') + for line in table: + f.write(','.join(line) + '\n') + print(output_filename) + + table = [] + summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] + headers = [''] + summarizer_model_abbrs + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + row = [dataset_abbr] + for model_cfg in self.compare_models: + model_abbr = model_abbr_from_cfg(model_cfg) + s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '') + if isinstance(s, float): + s = f'{s:.2f}' + if isinstance(s, int): + s = str(s) + row.append(s) + table.append(row) + txt = tabulate(table, headers=headers) + print(txt) + + if idx == len(self.judge_models): + output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv') + else: + output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv') + with open(output_filename, 'w') as f: + f.write(','.join(headers) + '\n') + for line in table: + f.write(','.join(line) + '\n') + print(output_filename) diff --git a/opencompass/utils/prompt.py b/opencompass/utils/prompt.py index 496eec15..d65f6a03 100644 --- a/opencompass/utils/prompt.py +++ b/opencompass/utils/prompt.py @@ -2,6 +2,7 @@ from __future__ import annotations import hashlib import json +import re from copy import deepcopy from typing import Dict, List, Union @@ -19,9 +20,15 @@ def safe_format(input_str: str, **kwargs) -> str: Returns: str: The formatted string. 
""" + segs = [input_str] for k, v in kwargs.items(): - input_str = input_str.replace(f'{{{k}}}', str(v)) - return input_str + regex = re.compile(f'(?<={{{k}}})(?={{{k}}})|({{{k}}})') + segs = [regex.split(seg) for seg in segs] + segs = sum(segs, []) + replace_dict = {f'{{{k}}}': str(v) for k, v in kwargs.items()} + segs = [replace_dict.get(seg, seg) for seg in segs] + output_str = ''.join(segs) + return output_str def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str: diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index b584795f..0072060c 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -86,8 +86,14 @@ def get_config_from_arg(args) -> Config: config['models'] = change_accelerator(config['models'], args.accelerator) if config.get('eval', {}).get('partitioner', {}).get('models') is not None: config['eval']['partitioner']['models'] = change_accelerator(config['eval']['partitioner']['models'], args.accelerator) + if config.get('eval', {}).get('partitioner', {}).get('base_models') is not None: + config['eval']['partitioner']['base_models'] = change_accelerator(config['eval']['partitioner']['base_models'], args.accelerator) + if config.get('eval', {}).get('partitioner', {}).get('compare_models') is not None: + config['eval']['partitioner']['compare_models'] = change_accelerator(config['eval']['partitioner']['compare_models'], args.accelerator) if config.get('eval', {}).get('partitioner', {}).get('judge_models') is not None: config['eval']['partitioner']['judge_models'] = change_accelerator(config['eval']['partitioner']['judge_models'], args.accelerator) + if config.get('judge_models', {}) is not None: + config['judge_models'] = change_accelerator(config['judge_models'], args.accelerator) return config # parse dataset args @@ -211,7 +217,7 @@ def change_accelerator(models, accelerator): mod = TurboMindModel acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', + abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', path=model['path'], engine_config=dict(session_len=model['max_seq_len'], max_batch_size=model['batch_size'], @@ -254,7 +260,7 @@ def change_accelerator(models, accelerator): mod = VLLMwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-vllm', + abbr=model['abbr'].replace('hf', 'vllm') if '-hf' in model['abbr'] else model['abbr'] + '-vllm', path=model['path'], model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']), max_out_len=model['max_out_len'], @@ -266,7 +272,7 @@ def change_accelerator(models, accelerator): mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-turbomind', + abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', path=model['path'], engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),