[Fix] Fix subjective alignbench (#730)

Author: bittersweet1999, 2023-12-23 20:06:53 +08:00 (committed by GitHub)
Commit e985100cd1 (parent 0e24f4213e)
4 changed files with 129 additions and 32 deletions

File: opencompass/partitioners/sub_naive.py

@@ -8,6 +8,18 @@ from opencompass.registry import PARTITIONERS
 from .naive import NaivePartitioner


+def remove_duplicate_pairs(model_combinations):
+    combo_dict = {}
+    for i, combo in enumerate(model_combinations):
+        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
+        if sorted_names not in combo_dict:
+            combo_dict[sorted_names] = i
+    new_model_combinations = [
+        model_combinations[i] for i in combo_dict.values()
+    ]
+    return new_model_combinations
+
+
 @PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
     """Naive task partitioner for subjective evaluation. Compared to

@@ -35,17 +47,6 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         self.compare_models = compare_models
         self.model_pairs = model_pairs

-    def remove_duplicate_pairs(self, model_combinations):
-        combo_dict = {}
-        for i, combo in enumerate(model_combinations):
-            sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
-            if sorted_names not in combo_dict:
-                combo_dict[sorted_names] = i
-        new_model_combinations = [
-            model_combinations[i] for i in combo_dict.values()
-        ]
-        return new_model_combinations
-
     def get_model_combinations(
         self,
         models: List[ConfigDict],

@@ -57,7 +58,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         elif self.mode == 'm2n':
             assert len(base_models) > 0 and len(compare_models) > 0
             model_combinations = list(product(base_models, compare_models))
-            unique_combinations = self.remove_duplicate_pairs([
+            unique_combinations = remove_duplicate_pairs([
                 combo for combo in model_combinations if combo[0] != combo[1]
             ])
             return unique_combinations
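
For context (not part of the diff): a minimal sketch of how the now module-level helper collapses symmetric pairs in 'm2n' mode; the model dicts below are placeholders, and the import path is the one this commit introduces elsewhere.

# Sketch only: placeholder model configs; remove_duplicate_pairs is assumed
# importable from opencompass.partitioners.sub_naive, as this commit sets up.
from itertools import product

from opencompass.partitioners.sub_naive import remove_duplicate_pairs

base_models = [{'abbr': 'model_a'}, {'abbr': 'model_b'}]
compare_models = [{'abbr': 'model_b'}, {'abbr': 'model_a'}]

# Build every (base, compare) pair, drop self-comparisons...
combos = [c for c in product(base_models, compare_models) if c[0] != c[1]]
# ...then collapse symmetric duplicates such as (a, b) vs (b, a).
unique = remove_duplicate_pairs(combos)
print([(a['abbr'], b['abbr']) for a, b in unique])
# -> [('model_a', 'model_b')]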

File: AlignmentBench summarizer (AlignmentBenchSummarizer)

@@ -38,9 +38,7 @@ def post_process(judgment: str):
         dictionary_str = match.group(1)
         kv_pattern = r"'(.*?)': (\d+)"
         matches = re.findall(kv_pattern, dictionary_str)
         result_dict = {key: int(value) for key, value in matches}
         return result_dict
     else:
         return None
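
A quick aside (not part of the diff): post_process pulls a Python-style score dict out of the judge output with the key-value regex shown above. A tiny sketch on an invented judgment string; the outer {...} search regex here is an assumption, not the exact one in the file.

import re

judgment = "评分: {'事实正确性': 7, '满足用户需求': 8, '综合得分': 8}"  # invented example
match = re.search(r'(\{.*?\})', judgment)  # assumed: grab the first {...} span
if match:
    dictionary_str = match.group(1)
    kv_pattern = r"'(.*?)': (\d+)"
    matches = re.findall(kv_pattern, dictionary_str)
    result_dict = {key: int(value) for key, value in matches}
    print(result_dict)  # {'事实正确性': 7, '满足用户需求': 8, '综合得分': 8}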
@@ -95,6 +93,7 @@ class AlignmentBenchSummarizer:
         self.eval_model_abbrs = [
             model_abbr_from_cfg(model) for model in self.eval_model_cfgs
         ]
+        self.judge_abbr = self.cfg['judge_model']['abbr']

     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):

@@ -106,6 +105,7 @@ class AlignmentBenchSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
         dataset_cfgs = self.cfg['datasets']
         work_dir = self.cfg['work_dir']
         self.work_dir = work_dir
@@ -118,19 +118,48 @@ class AlignmentBenchSummarizer:
         results_folder = osp.join(work_dir, 'results')
         fout_flag, fout_flag2 = 0, 0
-        for subdir in os.listdir(results_folder):
-            if subdir not in self.eval_model_abbrs:
-                continue
+        for eval_model_abbr in self.eval_model_abbrs:
+            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + 'dimension.csv')
-                fout2 = osp.join(output_dir, judge_model + 'capability.csv')
+                model, judge_model = eval_model_abbr, self.judge_abbr
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-dimension.csv')
+                fout2 = osp.join(
+                    output_dir,
+                    'judged-by--' + judge_model + '-capability.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
-                                            dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    filename = os.path.join(subdir_path,
+                                            dataset_abbr + '.json')
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename +
+                              ' or ' + partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
                     judged_answers = []
                     references = []
                     for k, v in result.items():

@@ -144,8 +173,14 @@ class AlignmentBenchSummarizer:
                     print(
                         f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
                     )
+                    if len(judged_answers) == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert len(judged_answers) > 0
+                    # Initialize nested dicts to store models and ratings
                     dimension_ratings = defaultdict(int)
                     dimension_counts = defaultdict(int)
                     capability_ratings = defaultdict(int)

@@ -225,6 +260,8 @@ class AlignmentBenchSummarizer:
                         for sub_category in sub_categories:
                             row.append(scores[model][sub_category])
                         writer.writerow(row)
+            else:
+                print(subdir_path + ' does not exist, please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
             print(x)
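
For context (not part of the diff): the per-dataset loading above follows a "complete file, else numbered partial files" convention for judge results. A standalone sketch of that merge, using plain json and a hypothetical helper name instead of the summarizer internals.

import json
import os.path as osp


def load_judged_results(subdir_path, dataset_abbr):
    """Load <abbr>.json if complete, else merge <abbr>_0.json, <abbr>_1.json, ..."""
    filename = osp.join(subdir_path, dataset_abbr + '.json')
    partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
    result = {}
    if osp.exists(filename):
        # A single complete result file wins.
        with open(filename) as f:
            result = json.load(f)
    elif osp.exists(partial_filename):
        # Renumber merged entries with consecutive integer keys, as above.
        idx, flag = 1, 0
        current = partial_filename
        while osp.exists(current):
            with open(current) as f:
                for value in json.load(f).values():
                    result[flag] = value
                    flag += 1
            current = osp.join(subdir_path,
                               dataset_abbr + '_' + str(idx) + '.json')
            idx += 1
    return result  # empty dict means nothing was judged for this dataset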

File: Corev2 subjective summarizer (Corev2Summarizer)

@@ -5,6 +5,7 @@ import os.path as osp
 import re
 from collections import defaultdict
 from datetime import datetime
+from itertools import product

 import mmengine
 from mmengine import ConfigDict

@@ -14,6 +15,7 @@ try:
 except ImportError:
     from_csv = None

+from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg

@@ -54,6 +56,9 @@ class Corev2Summarizer:
         self.tasks = []
         self.cfg = config
         self.match_method = match_method
+        self.base_models = self.cfg['eval']['partitioner']['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.judge_abbr = self.cfg['judge_model']['abbr']

     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -76,25 +81,70 @@ class Corev2Summarizer:
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
-        for subdir in os.listdir(results_folder):
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])
+        for model_pair in unique_combinations:
+            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
+                'abbr'], self.judge_abbr
+            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model1, model2, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + '-report.csv')
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-report.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
-                                            dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    filename = os.path.join(subdir_path,
+                                            dataset_abbr + '.json')
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename +
+                              ' or ' + partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
                     judged_answers = []
                     references = []
                     for k, v in result.items():
                         judged_answers.append(
                             call_function(self.match_method, v['prediction']))
                         references.append(v['gold'])
+                    successful_judged_answers = len(
+                        judged_answers) - judged_answers.count(None)
                     print(
-                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers)-judged_answers.count(None)} judgements.'
+                        f'Among {len(judged_answers)} judgements, successfully extracted {successful_judged_answers} judgements.'
                     )
+                    if successful_judged_answers == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert successful_judged_answers > 0
                     win_both_model1, win_both_model2, half_draw_model1, half_draw_model2, categories = defaultdict(
                         float), defaultdict(float), defaultdict(
                             float), defaultdict(float), defaultdict(float)

@@ -168,6 +218,8 @@ class Corev2Summarizer:
                         writer.writerow(
                             [row] +
                             [scores[row][column] for column in columns])
+            else:
+                print(subdir_path + ' does not exist, please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
             print(x)
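
For context (not part of the diff): Corev2Summarizer now derives the result directories from the configured model pairs instead of scanning the results folder. A small sketch of the naming it expects, with placeholder abbreviations.

from itertools import product

from opencompass.partitioners.sub_naive import remove_duplicate_pairs

# Placeholder configs; the real ones come from cfg['eval']['partitioner'].
base_models = [{'abbr': 'model_x'}, {'abbr': 'model_y'}]
compare_models = [{'abbr': 'model_y'}, {'abbr': 'model_x'}]
judge_abbr = 'judge_model'  # placeholder

pairs = remove_duplicate_pairs(
    [c for c in product(base_models, compare_models) if c[0] != c[1]])
for m1, m2 in pairs:
    # The subdir layout the summarizer looks up: <m1>_<m2>_judged-by--<judge>
    print(m1['abbr'] + '_' + m2['abbr'] + '_judged-by--' + judge_abbr)
    # -> model_x_model_y_judged-by--judge_model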

File: subjective evaluation task (SubjectiveEvalTask)

@@ -96,8 +96,11 @@ class SubjectiveEvalTask(BaseTask):
         root, ext = osp.splitext(filename)
         partial_filename = root + '_0' + ext
         pred_strs = None
-        if osp.exists(osp.realpath(filename)) or osp.exists(
+        if not osp.exists(osp.realpath(filename)) and not osp.exists(
                 osp.realpath(partial_filename)):
+            return {'error': 'No predictions found.'}
+        else:
             if osp.exists(osp.realpath(filename)):
                 preds = mmengine.load(filename)
                 pred_strs = [
@@ -172,8 +175,12 @@ class SubjectiveEvalTask(BaseTask):
         eval_cfg['evaluator']['output_path'] = out_path
         icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
         references = (test_set[output_column] if output_column else None)
-        result = icl_evaluator.score(predictions=model_preds,
-                                     references=references)
+        if 'error' not in model_preds:
+            result = icl_evaluator.score(predictions=model_preds,
+                                         references=references)
+        else:
+            result = model_preds
         if 'error' in result:
             self.logger.error(
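
For context (not part of the diff): rather than crashing when a prediction file is missing, the task now returns an error marker and skips scoring. A stripped-down sketch of that pass-through; load_model_preds and DummyEvaluator are stand-in names, not OpenCompass APIs.

import os.path as osp


def load_model_preds(filename):
    root, ext = osp.splitext(filename)
    partial_filename = root + '_0' + ext
    if not osp.exists(filename) and not osp.exists(partial_filename):
        # Signal the problem instead of raising, as the task code above does.
        return {'error': 'No predictions found.'}
    return {'predictions': ['...loaded predictions...']}


class DummyEvaluator:

    def score(self, predictions, references=None):
        return {'accuracy': 0.0}


model_preds = load_model_preds('outputs/predictions/model/dataset.json')
if 'error' not in model_preds:
    result = DummyEvaluator().score(predictions=model_preds, references=None)
else:
    # The error dict becomes the result and is reported downstream.
    result = model_preds
print(result)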