Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Sync] format (#1214)
This commit is contained in:
parent d59189b87f
commit a77b8a5cec
@@ -0,0 +1,58 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'judge_prompt'],
    output_column='judge',
)

data_path = 'data/subjective/compassbench'

subjective_datasets = []

versions = ['CompassbenchV1']

for version_abbr in versions:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt='{judge_prompt}'
                    ),
                ]),
            ),
        ),
        pred_role='BOT',
    )

    subjective_datasets.append(
        dict(
            abbr=version_abbr,
            type=CompassBenchDataset,
            path=data_path,
            name=version_abbr,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
        ))

configs/eval_subjective_compassbench.py (new file, 137 lines)
@@ -0,0 +1,137 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassBenchSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

# -------------Inference Stage ----------------------------------------

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm2-chat-7b-hf',
        path='internlm/internlm2-chat-7b',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
        stop_words=['</s>', '<|im_end|>'],
        generation_kwargs=dict(
            do_sample=True,
        ),
    )
]

datasets = [*subjective_datasets]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='reserved',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask),
    ),
)

gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key is read from $OPENAI_API_KEY, but you can also write your key here
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)  # Re-run inference for GPT-4's predictions, or choose to reuse the pre-committed GPT-4 predictions

# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key is read from $OPENAI_API_KEY, but you can also write your key here
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)]

# NOTE: the assignment below overrides the GPT-4 judge defined above.
judge_models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b2',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b3',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    )
]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000000,
        mode='m2n',
        infer_order='double',
        base_models=[gpt4],
        compare_models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
    # given_pred = [{'abbr': 'gpt4-turbo', 'path': ''}]
)

work_dir = 'outputs/compassbench/'

summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')
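A quick way to sanity-check a config like the one above before launching a full run is to load it with mmengine, the same machinery `read_base()` relies on, and inspect the assembled pieces. This is only a sketch and assumes opencompass and mmengine are importable in the current environment:

# Pre-flight check (sketch): confirm the config resolves and the eval
# partitioner sees the expected candidate and judge models.
from mmengine.config import Config

cfg = Config.fromfile('configs/eval_subjective_compassbench.py')
print(len(cfg['datasets']), 'dataset entries')
print('candidates:', [m['abbr'] for m in cfg['models']])
print('judges:', [m['abbr'] for m in cfg['eval']['partitioner']['judge_models']])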
@@ -20,16 +20,16 @@ prompts = [
 ]


-charm_reaso_summary_groups = []
+charm_reason_summary_groups = []
 for prompt in prompts:
     for region in regions:
         subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
-        charm_reaso_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
+        charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})

 for prompt in prompts:
     subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
-    charm_reaso_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
+    charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})

-charm_reaso_summary_groups.append(
+charm_reason_summary_groups.append(
     {'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
 )
@@ -1,6 +1,7 @@
 from .alignbench import AlignmentBenchDataset  # noqa: F401, F403
 from .arena_hard import ArenaHardDataset  # noqa: F401, F403
 from .compass_arena import CompassArenaDataset  # noqa: F401, F403
+from .compassbench import CompassBenchDataset  # noqa: F401, F403
 from .corev2 import Corev2Dataset  # noqa: F401, F403
 from .creationbench import CreationBenchDataset  # noqa: F401, F403
 from .information_retrival import IRDataset  # noqa: F401, F403

opencompass/datasets/subjective/compassbench.py (new file, 101 lines)
@@ -0,0 +1,101 @@
# flake8: noqa
import json
import os.path as osp

from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from ..base import BaseDataset

# Chinese judge prompt, used when a sample's language is 'zh'.
base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。
[用户问题]
{question}

[回答1开始]
{prediction}
[回答1结束]

[回答2开始]
{prediction2}
[回答2结束]

根据评分要求,请先对两个回答进行评价,最后在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局

如果你认为回答1更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[A]]

如果你认为回答2更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[B]]

如果你认为回答1、2打成平手,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[C]]
"""

# English judge prompt, used for all other languages.
base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options:
A. Response 1 is better
B. Response 2 is better
C. Both responses are equal

[user's question]
{question}

[Response 1 Start]
{prediction}
[Response 1 End]

[Response 2 Start]
{prediction2}
[Response 2 End]

If you believe that Response 1 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[A]]

If you believe that Response 2 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[B]]

If you believe that both responses are equally good, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[C]]
"""


@LOAD_DATASET.register_module()
class CompassBenchDataset(BaseDataset):

    def load(self, path: str, name: str):
        filename = osp.join(path, f'{name}.json')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            for problem in json_data:
                question = problem['question']
                lan = problem['language']
                others = problem['others']
                judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en
                raw_data.append({
                    'question': question,
                    'judge_prompt': judge_prompt,
                    'judge': {
                        'lan': lan,
                        'level': others['level'],
                        'category': problem['category'],
                        'question': question
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
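For reference, the loader above only touches `question`, `language`, `category`, and `others['level']` in each record of `{name}.json`. A minimal input file can therefore be sketched as follows (the concrete records are invented purely for illustration):

# Hypothetical records for data/subjective/compassbench/CompassbenchV1.json,
# limited to the fields CompassBenchDataset.load() actually reads.
import json

records = [
    {
        'question': 'Explain the difference between TCP and UDP.',
        'language': 'en',         # selects base_prompt_en
        'category': 'knowledge',  # copied into the 'judge' column
        'others': {'level': 2},   # only others['level'] is used
    },
    {
        'question': '请简要介绍水的三态变化。',
        'language': 'zh',         # selects base_prompt_zh
        'category': 'knowledge',
        'others': {'level': 1},
    },
]
with open('CompassbenchV1.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)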
@@ -4,6 +4,7 @@ from .all_obj import AllObjSummarizer
 from .alpacaeval import AlpacaSummarizer
 from .arenahard import ArenaHardSummarizer
 from .compass_arena import CompassArenaSummarizer
+from .compassbench import CompassBenchSummarizer
 from .corev2 import Corev2Summarizer
 from .creationbench import CreationBenchSummarizer
 from .flames import FlamesSummarizer

opencompass/summarizers/subjective/compassbench.py (new file, 241 lines)
@@ -0,0 +1,241 @@
# flake8: noqa
# yapf: disable
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
from itertools import product

import mmengine
from mmengine import ConfigDict
from tabulate import tabulate

from opencompass.partitioners.sub_naive import remove_duplicate_pairs
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg

from .utils import get_judgeanswer_and_reference, get_outdir


def model_abbr_from_cfg_used_in_summarizer(model):
    if model.get('summarizer_abbr', None):
        return model['summarizer_abbr']
    else:
        return model_abbr_from_cfg(model)


def post_process_compass_arena(s):
    if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', s):
        return result[0]
    else:
        return None


def check_position_bias(judged_answers, references, banned_choice=['C']):
    """Check position bias in the judge LLM's judgements.

    Args:
        judged_answers: The successfully extracted judgements.
        references: The references contain the original question, which is
            used to locate the same question across the two position-swapped
            judgements.
    """
    position_bias_flag = 0
    position_bias_dict = {}
    for judge, ref in zip(judged_answers, references):
        question = ref['question']
        question_hash = hash(question)
        if question_hash not in position_bias_dict:
            position_bias_dict[question_hash] = {
                'question': question,
                'judge': judge
            }
        else:
            first_judge = position_bias_dict[question_hash]['judge']
            if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
                # If the second-round choice is the same as the first-round
                # choice (with the answer order swapped), there is position bias.
                position_bias_flag += 1
    return position_bias_flag


class CompassBenchSummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self,
                 config: ConfigDict,
                 judge_type='general',
                 check_pos_bias=True,
                 summary_type='single') -> None:
        self.tasks = []
        self.cfg = config
        self.base_models = self.cfg['eval']['partitioner']['base_models']
        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
        self.judge_models = self.cfg.get('judge_models', None)
        self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
        self.judge_type = judge_type
        assert self.judge_type in ['general']
        self.judge_map = {'general': post_process_compass_arena}
        self.judge_function = self.judge_map[self.judge_type]
        self.check_pos_bias = check_pos_bias
        self.summary_type = summary_type

    def get_score(self, time_str):
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        model_combinations = list(product(self.base_models, self.compare_models))
        unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])

        if self.meta_judge_model is not None:
            self.judge_models.append(self.meta_judge_model)

        scores = {}

        for idx, judge_model_cfg in enumerate(self.judge_models):
            judge_model = model_abbr_from_cfg(judge_model_cfg)
            for dataset in self.cfg['datasets']:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
                for model_pair in unique_combinations:
                    model1 = model_pair[0]['abbr']
                    model2 = model_pair[1]['abbr']
                    if idx == len(self.judge_models):
                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
                    else:
                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
                    subdir_path = os.path.join(results_folder, subdir)
                    if not os.path.isdir(subdir_path):
                        print(subdir_path + ' does not exist! Please check!')
                        continue
                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
                    if self.check_pos_bias:
                        bias_num = check_position_bias(judged_answers, references)
                    else:
                        bias_num = 0
                    win_model1 = defaultdict(float)
                    win_model2 = defaultdict(float)
                    categories = defaultdict(float)
                    difficulties = defaultdict(float)
                    model1 = references[0]['answer1']
                    model2 = references[0]['answer2']
                    for prediction, reference in zip(judged_answers, references):
                        categories[dataset_abbr] += 1
                        categories[reference['category']] += 1
                        difficulties[reference['level']] += 1

                        if prediction == 'A':
                            if reference['answer1'] == model1:
                                score_1, score_2 = 1, 0
                            else:
                                score_1, score_2 = 0, 1
                        elif prediction == 'B':
                            if reference['answer1'] == model1:
                                score_1, score_2 = 0, 1
                            else:
                                score_1, score_2 = 1, 0
                        elif prediction == 'C':
                            if self.summary_type == 'half_add':
                                score_1, score_2 = 0.5, 0.5
                            else:
                                score_1, score_2 = 0, 0

                        win_model1[reference['category']] += score_1
                        win_model1[dataset_abbr] += score_1
                        win_model2[reference['category']] += score_2
                        win_model2[dataset_abbr] += score_2
                    for category in categories:
                        win_model1[category] = win_model1[category] / categories[category] * 100
                        win_model1[category] = round(win_model1[category], 2)
                        win_model2[category] = win_model2[category] / categories[category] * 100
                        win_model2[category] = round(win_model2[category], 2)
                    win_model1['position_bias'] = bias_num
                    win_model2['position_bias'] = bias_num

                    if judge_model not in scores:
                        scores[judge_model] = {}
                    if dataset_abbr not in scores[judge_model]:
                        scores[judge_model][dataset_abbr] = {}
                    scores[judge_model][dataset_abbr][model2] = win_model2

        return scores

    def summarize(
        self,
        time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
    ):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """

        scores = self.get_score(time_str)
        # scores['win_' + model1] = win_model1
        output_dir, results_folder = get_outdir(self.cfg, time_str)

        for idx, judge_model in enumerate(self.judge_models):
            judge_abbr = model_abbr_from_cfg(judge_model)
            for dataset in self.cfg['datasets']:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
                summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
                one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
                row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']]
                row_headers = [dataset_abbr, 'position_bias'] + row_headers
                headers = [''] + summarizer_model_abbrs
                table = []
                for row_header in row_headers:
                    row = [row_header]
                    for model_cfg in self.compare_models:
                        model_abbr = model_abbr_from_cfg(model_cfg)
                        s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
                        if isinstance(s, float):
                            s = f'{s:.2f}'
                        if isinstance(s, int):
                            s = str(s)
                        row.append(s)
                    table.append(row)
                txt = tabulate(table, headers=headers)
                print(txt)

                if idx == len(self.judge_models):
                    output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
                else:
                    output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')

                with open(output_filename, 'w') as f:
                    f.write(','.join(headers) + '\n')
                    for line in table:
                        f.write(','.join(line) + '\n')
                print(output_filename)

            table = []
            summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
            headers = [''] + summarizer_model_abbrs
            for dataset in self.cfg['datasets']:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
                row = [dataset_abbr]
                for model_cfg in self.compare_models:
                    model_abbr = model_abbr_from_cfg(model_cfg)
                    s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
                    if isinstance(s, float):
                        s = f'{s:.2f}'
                    if isinstance(s, int):
                        s = str(s)
                    row.append(s)
                table.append(row)
            txt = tabulate(table, headers=headers)
            print(txt)

            if idx == len(self.judge_models):
                output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
            else:
                output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
            with open(output_filename, 'w') as f:
                f.write(','.join(headers) + '\n')
                for line in table:
                    f.write(','.join(line) + '\n')
            print(output_filename)
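To make the judge-output contract concrete, here is a small self-contained sketch of the choice-extraction step used by the summarizer above (the helper name and the sample judgement text are invented for illustration):

import re

def extract_choice(judgement: str):
    # Same rule as post_process_compass_arena above: take the first
    # '选择:[[X]]' / 'Choice: [[X]]' verdict, where X is A, B, or C.
    if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', judgement):
        return result[0]
    return None

sample_judgement = (
    'Evaluation 1: Response 1 is accurate but terse.\n'
    'Evaluation 2: Response 2 is detailed and well structured.\n'
    'Choice: [[B]]'
)
print(extract_choice(sample_judgement))  # -> 'B'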
@@ -2,6 +2,7 @@ from __future__ import annotations

 import hashlib
 import json
+import re
 from copy import deepcopy
 from typing import Dict, List, Union

@@ -19,9 +20,15 @@ def safe_format(input_str: str, **kwargs) -> str:
     Returns:
         str: The formatted string.
     """
+    segs = [input_str]
     for k, v in kwargs.items():
-        input_str = input_str.replace(f'{{{k}}}', str(v))
-    return input_str
+        regex = re.compile(f'(?<={{{k}}})(?={{{k}}})|({{{k}}})')
+        segs = [regex.split(seg) for seg in segs]
+        segs = sum(segs, [])
+    replace_dict = {f'{{{k}}}': str(v) for k, v in kwargs.items()}
+    segs = [replace_dict.get(seg, seg) for seg in segs]
+    output_str = ''.join(segs)
+    return output_str


 def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str:
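The point of the new splitting logic above is that substituted values are never re-scanned for further placeholders, which the old str.replace loop did. A self-contained sketch of the same idea (the function name is ours; the body mirrors the patched safe_format shown in the hunk):

import re

def safe_format_sketch(input_str, **kwargs):
    # Split the template on placeholder boundaries first, then map each
    # placeholder segment exactly once, so a substituted value that happens
    # to look like '{b}' is left untouched.
    segs = [input_str]
    for k in kwargs:
        regex = re.compile(f'(?<={{{k}}})(?={{{k}}})|({{{k}}})')
        segs = sum([regex.split(seg) for seg in segs], [])
    replace_dict = {f'{{{k}}}': str(v) for k, v in kwargs.items()}
    return ''.join(replace_dict.get(seg, seg) for seg in segs)

# The old replace() loop would produce 'X X' here, because the '{b}' inserted
# for {a} was itself replaced in the next iteration.
print(safe_format_sketch('{a} {b}', a='{b}', b='X'))  # -> '{b} X'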
@@ -86,8 +86,14 @@ def get_config_from_arg(args) -> Config:
            config['models'] = change_accelerator(config['models'], args.accelerator)
        if config.get('eval', {}).get('partitioner', {}).get('models') is not None:
            config['eval']['partitioner']['models'] = change_accelerator(config['eval']['partitioner']['models'], args.accelerator)
        if config.get('eval', {}).get('partitioner', {}).get('base_models') is not None:
            config['eval']['partitioner']['base_models'] = change_accelerator(config['eval']['partitioner']['base_models'], args.accelerator)
        if config.get('eval', {}).get('partitioner', {}).get('compare_models') is not None:
            config['eval']['partitioner']['compare_models'] = change_accelerator(config['eval']['partitioner']['compare_models'], args.accelerator)
        if config.get('eval', {}).get('partitioner', {}).get('judge_models') is not None:
            config['eval']['partitioner']['judge_models'] = change_accelerator(config['eval']['partitioner']['judge_models'], args.accelerator)
        if config.get('judge_models', {}) is not None:
            config['judge_models'] = change_accelerator(config['judge_models'], args.accelerator)
        return config

    # parse dataset args
@@ -211,7 +217,7 @@ def change_accelerator(models, accelerator):
             mod = TurboMindModel
             acc_model = dict(
                 type=f'{mod.__module__}.{mod.__name__}',
-                abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
+                abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
                 path=model['path'],
                 engine_config=dict(session_len=model['max_seq_len'],
                                    max_batch_size=model['batch_size'],
@@ -254,7 +260,7 @@ def change_accelerator(models, accelerator):
             mod = VLLMwithChatTemplate
             acc_model = dict(
                 type=f'{mod.__module__}.{mod.__name__}',
-                abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-vllm',
+                abbr=model['abbr'].replace('hf', 'vllm') if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
                 path=model['path'],
                 model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']),
                 max_out_len=model['max_out_len'],
@@ -266,7 +272,7 @@ def change_accelerator(models, accelerator):
             mod = TurboMindModelwithChatTemplate
             acc_model = dict(
                 type=f'{mod.__module__}.{mod.__name__}',
-                abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-turbomind',
+                abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
                 path=model['path'],
                 engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']),
                 gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
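The three hunks above converge on one renaming rule for accelerated model aliases. A tiny sketch of that rule (the helper name is ours, for illustration only):

# Sketch of the abbr rewrite used above: swap the 'hf' marker for the backend
# name when the abbr carries a '-hf' suffix, otherwise append the backend.
def rename_abbr(abbr: str, backend: str) -> str:
    return abbr.replace('hf', backend) if '-hf' in abbr else abbr + '-' + backend

print(rename_abbr('internlm2-chat-7b-hf', 'turbomind'))  # internlm2-chat-7b-turbomind
print(rename_abbr('qwen1.5-7b-chat', 'vllm'))            # qwen1.5-7b-chat-vllm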