Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

[Fix] Fix subjective alignbench (#730)

parent 0e24f4213e
commit e985100cd1
@@ -8,6 +8,18 @@ from opencompass.registry import PARTITIONERS
 from .naive import NaivePartitioner
 
 
+def remove_duplicate_pairs(model_combinations):
+    combo_dict = {}
+    for i, combo in enumerate(model_combinations):
+        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
+        if sorted_names not in combo_dict:
+            combo_dict[sorted_names] = i
+    new_model_combinations = [
+        model_combinations[i] for i in combo_dict.values()
+    ]
+    return new_model_combinations
+
+
 @PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
     """Naive task partitioner for subjective evaluation. Compared to
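The helper promoted to module level here keeps only one of each unordered model pair. A minimal standalone sketch of its behaviour, assuming model configs that are plain dicts with only an 'abbr' key (placeholder names, not real OpenCompass configs):

from itertools import product


def remove_duplicate_pairs(model_combinations):
    # Same logic as the helper added in this hunk: key each pair by its
    # sorted abbr tuple so (A, B) and (B, A) collapse to one entry.
    combo_dict = {}
    for i, combo in enumerate(model_combinations):
        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
        if sorted_names not in combo_dict:
            combo_dict[sorted_names] = i
    return [model_combinations[i] for i in combo_dict.values()]


models = [{'abbr': 'model-a'}, {'abbr': 'model-b'}]
pairs = [c for c in product(models, models) if c[0] != c[1]]
print(len(pairs))                          # 2 -> (a, b) and (b, a)
print(len(remove_duplicate_pairs(pairs)))  # 1 -> only one direction kept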
@@ -35,17 +47,6 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         self.compare_models = compare_models
         self.model_pairs = model_pairs
 
-    def remove_duplicate_pairs(self, model_combinations):
-        combo_dict = {}
-        for i, combo in enumerate(model_combinations):
-            sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
-            if sorted_names not in combo_dict:
-                combo_dict[sorted_names] = i
-        new_model_combinations = [
-            model_combinations[i] for i in combo_dict.values()
-        ]
-        return new_model_combinations
-
     def get_model_combinations(
         self,
         models: List[ConfigDict],
@@ -57,7 +58,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         elif self.mode == 'm2n':
             assert len(base_models) > 0 and len(compare_models) > 0
             model_combinations = list(product(base_models, compare_models))
-            unique_combinations = self.remove_duplicate_pairs([
+            unique_combinations = remove_duplicate_pairs([
                 combo for combo in model_combinations if combo[0] != combo[1]
             ])
         return unique_combinations
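In 'm2n' mode the call site now uses the module-level helper. A hedged sketch of that flow with placeholder base/compare configs (the import path is the one this commit adds in opencompass.partitioners.sub_naive, so it only resolves once the commit is applied):

from itertools import product

from opencompass.partitioners.sub_naive import remove_duplicate_pairs

base_models = [{'abbr': 'base-1'}]
compare_models = [{'abbr': 'cmp-1'}, {'abbr': 'cmp-2'}]

model_combinations = list(product(base_models, compare_models))
unique_combinations = remove_duplicate_pairs(
    [combo for combo in model_combinations if combo[0] != combo[1]])
print([(a['abbr'], b['abbr']) for a, b in unique_combinations])
# [('base-1', 'cmp-1'), ('base-1', 'cmp-2')]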
@@ -38,9 +38,7 @@ def post_process(judgment: str):
         dictionary_str = match.group(1)
         kv_pattern = r"'(.*?)': (\d+)"
         matches = re.findall(kv_pattern, dictionary_str)
-
         result_dict = {key: int(value) for key, value in matches}
-
         return result_dict
     else:
         return None
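For reference, the two lines kept here do the actual score extraction. A small standalone sketch; the judge reply and the outer regex that locates the dict are illustrative assumptions, only the kv_pattern and the dict comprehension are taken from the hunk:

import re

# Illustrative judge reply; the real judgement embeds a Python-style dict of
# integer ratings somewhere in its text.
judgment = "Scores: {'dimension_a': 7, 'dimension_b': 9, 'overall': 8}"

match = re.search(r'(\{.*\})', judgment)  # assumed outer pattern, not from the diff
if match:
    dictionary_str = match.group(1)
    kv_pattern = r"'(.*?)': (\d+)"
    matches = re.findall(kv_pattern, dictionary_str)
    result_dict = {key: int(value) for key, value in matches}
    print(result_dict)  # {'dimension_a': 7, 'dimension_b': 9, 'overall': 8}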
@@ -95,6 +93,7 @@ class AlignmentBenchSummarizer:
         self.eval_model_abbrs = [
             model_abbr_from_cfg(model) for model in self.eval_model_cfgs
         ]
+        self.judge_abbr = self.cfg['judge_model']['abbr']
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -106,6 +105,7 @@ class AlignmentBenchSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
+
         dataset_cfgs = self.cfg['datasets']
         work_dir = self.cfg['work_dir']
         self.work_dir = work_dir
@@ -118,19 +118,48 @@ class AlignmentBenchSummarizer:
         results_folder = osp.join(work_dir, 'results')
 
         fout_flag, fout_flag2 = 0, 0
-        for subdir in os.listdir(results_folder):
-            if subdir not in self.eval_model_abbrs:
-                continue
+        for eval_model_abbr in self.eval_model_abbrs:
+            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + 'dimension.csv')
-                fout2 = osp.join(output_dir, judge_model + 'capability.csv')
+                model, judge_model = eval_model_abbr, self.judge_abbr
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-dimension.csv')
+                fout2 = osp.join(
+                    output_dir,
+                    'judged-by--' + judge_model + '-capability.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
+                    filename = os.path.join(subdir_path,
                                             dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename + ' or ' +
+                              partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
+
                     judged_answers = []
                     references = []
                     for k, v in result.items():
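The loop now derives the results directory and the output CSV names from the configured abbreviations instead of parsing directory names with split('_'). A quick sketch of the naming scheme, with placeholder abbreviations and output path:

import os.path as osp

eval_model_abbr, judge_abbr = 'model-a', 'judge-x'  # placeholders
output_dir = 'outputs/summary'                      # placeholder

subdir = eval_model_abbr + '_judged-by--' + judge_abbr
fout = osp.join(output_dir, 'judged-by--' + judge_abbr + '-dimension.csv')
fout2 = osp.join(output_dir, 'judged-by--' + judge_abbr + '-capability.csv')

print(subdir)  # model-a_judged-by--judge-x
print(fout)    # outputs/summary/judged-by--judge-x-dimension.csv
print(fout2)   # outputs/summary/judged-by--judge-x-capability.csv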
@@ -144,8 +173,14 @@ class AlignmentBenchSummarizer:
                     print(
                         f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
                     )
+                    if len(judged_answers) == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert len(judged_answers) > 0
 
-                    # Initialize nested dicts for storing models and ratings
                     dimension_ratings = defaultdict(int)
                     dimension_counts = defaultdict(int)
                     capability_ratings = defaultdict(int)
@@ -225,6 +260,8 @@ class AlignmentBenchSummarizer:
                         for sub_category in sub_categories:
                             row.append(scores[model][sub_category])
                         writer.writerow(row)
+            else:
+                print(subdir_path + ' is not exist! please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
         print(x)
@@ -5,6 +5,7 @@ import os.path as osp
 import re
 from collections import defaultdict
 from datetime import datetime
+from itertools import product
 
 import mmengine
 from mmengine import ConfigDict
@@ -14,6 +15,7 @@ try:
 except ImportError:
     from_csv = None
 
+from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg
 
 
@@ -54,6 +56,9 @@ class Corev2Summarizer:
         self.tasks = []
         self.cfg = config
         self.match_method = match_method
+        self.base_models = self.cfg['eval']['partitioner']['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.judge_abbr = self.cfg['judge_model']['abbr']
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
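The new attributes are read straight from the eval partitioner section of the config. A minimal, illustrative ConfigDict fragment showing the keys the summarizer now expects (the abbreviations and structure of the values are placeholders):

from mmengine import ConfigDict

cfg = ConfigDict(
    eval=dict(partitioner=dict(
        base_models=[dict(abbr='base-1')],
        compare_models=[dict(abbr='cmp-1'), dict(abbr='cmp-2')],
    )),
    judge_model=dict(abbr='judge-x'),
)

base_models = cfg['eval']['partitioner']['base_models']
compare_models = cfg['eval']['partitioner']['compare_models']
judge_abbr = cfg['judge_model']['abbr']
print(judge_abbr)  # judge-x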
@@ -76,25 +81,70 @@ class Corev2Summarizer:
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
 
-        for subdir in os.listdir(results_folder):
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])
+
+        for model_pair in unique_combinations:
+            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
+                'abbr'], self.judge_abbr
+            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model1, model2, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + '-report.csv')
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-report.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
+                    filename = os.path.join(subdir_path,
                                             dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename + ' or ' +
+                              partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
+
                     judged_answers = []
                     references = []
                     for k, v in result.items():
                         judged_answers.append(
                             call_function(self.match_method, v['prediction']))
                         references.append(v['gold'])
+                    successful_judged_answers = len(
+                        judged_answers) - judged_answers.count(None)
                     print(
-                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers)-judged_answers.count(None)} judgements.'
+                        f'Among {len(judged_answers)} judgements, successfully extracted {successful_judged_answers} judgements.'
                     )
+                    if successful_judged_answers == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert successful_judged_answers > 0
+
                     win_both_model1, win_both_model2, half_draw_model1, half_draw_model2, categories = defaultdict(
                         float), defaultdict(float), defaultdict(
                             float), defaultdict(float), defaultdict(float)
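Both summarizers now fall back to sharded prediction files (<dataset_abbr>_0.json, _1.json, ...) when the single merged result file is missing. A standalone sketch of that merge, folded into a helper for clarity; the helper name and paths are illustrative, while the loop body mirrors the diff above:

import os.path as osp

import mmengine


def load_results(subdir_path, dataset_abbr):
    # Prefer the single merged file, otherwise stitch the shards together.
    filename = osp.join(subdir_path, dataset_abbr + '.json')
    if osp.exists(osp.realpath(filename)):
        return mmengine.load(filename)

    result, partial_dict_flag, i = {}, 0, 1
    filename = osp.join(subdir_path, dataset_abbr + '_0.json')
    while osp.exists(osp.realpath(filename)):
        for v in mmengine.load(filename).values():
            result[partial_dict_flag] = v  # re-key entries consecutively
            partial_dict_flag += 1
        filename = osp.join(subdir_path, dataset_abbr + '_' + str(i) + '.json')
        i += 1
    return result  # empty dict if nothing was found, as in the diff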
@@ -168,6 +218,8 @@ class Corev2Summarizer:
                         writer.writerow(
                             [row] +
                             [scores[row][column] for column in columns])
+            else:
+                print(subdir_path + ' is not exist! please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
         print(x)
@@ -96,8 +96,11 @@ class SubjectiveEvalTask(BaseTask):
         root, ext = osp.splitext(filename)
         partial_filename = root + '_0' + ext
         pred_strs = None
-        if osp.exists(osp.realpath(filename)) or osp.exists(
+
+        if not osp.exists(osp.realpath(filename)) and not osp.exists(
                 osp.realpath(partial_filename)):
+            return {'error': 'No predictions found.'}
+        else:
             if osp.exists(osp.realpath(filename)):
                 preds = mmengine.load(filename)
                 pred_strs = [
@@ -172,8 +175,12 @@ class SubjectiveEvalTask(BaseTask):
         eval_cfg['evaluator']['output_path'] = out_path
         icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
         references = (test_set[output_column] if output_column else None)
-        result = icl_evaluator.score(predictions=model_preds,
-                                     references=references)
+
+        if 'error' not in model_preds:
+            result = icl_evaluator.score(predictions=model_preds,
+                                         references=references)
+        else:
+            result = model_preds
 
         if 'error' in result:
             self.logger.error(
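Taken together, these two hunks make the task return an {'error': ...} marker instead of crashing when predictions are missing, and skip scoring when that marker is present. A standalone sketch of the resulting control flow with a stub evaluator (the stub and the evaluate wrapper are assumptions; the real evaluator comes from ICL_EVALUATORS.build):

class StubEvaluator:
    # Stands in for whatever ICL_EVALUATORS.build(...) returns.
    def score(self, predictions, references):
        return {'score': 100.0}


def evaluate(model_preds, references, icl_evaluator=StubEvaluator()):
    # Mirror of the new guard: only score real predictions, otherwise pass
    # the error marker through so the caller can log it.
    if 'error' not in model_preds:
        result = icl_evaluator.score(predictions=model_preds,
                                     references=references)
    else:
        result = model_preds
    return result


print(evaluate({'error': 'No predictions found.'}, references=None))
print(evaluate(['answer A'], references=['answer A']))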