diff --git a/opencompass/partitioners/sub_naive.py b/opencompass/partitioners/sub_naive.py
index 5ae1e801..e21193b0 100644
--- a/opencompass/partitioners/sub_naive.py
+++ b/opencompass/partitioners/sub_naive.py
@@ -8,6 +8,18 @@ from opencompass.registry import PARTITIONERS
 from .naive import NaivePartitioner
 
 
+def remove_duplicate_pairs(model_combinations):
+    combo_dict = {}
+    for i, combo in enumerate(model_combinations):
+        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
+        if sorted_names not in combo_dict:
+            combo_dict[sorted_names] = i
+    new_model_combinations = [
+        model_combinations[i] for i in combo_dict.values()
+    ]
+    return new_model_combinations
+
+
 @PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
     """Naive task partitioner for subjective evaluation. Compared to
@@ -35,17 +47,6 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         self.compare_models = compare_models
         self.model_pairs = model_pairs
 
-    def remove_duplicate_pairs(self, model_combinations):
-        combo_dict = {}
-        for i, combo in enumerate(model_combinations):
-            sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
-            if sorted_names not in combo_dict:
-                combo_dict[sorted_names] = i
-        new_model_combinations = [
-            model_combinations[i] for i in combo_dict.values()
-        ]
-        return new_model_combinations
-
     def get_model_combinations(
         self,
         models: List[ConfigDict],
@@ -57,7 +58,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         elif self.mode == 'm2n':
             assert len(base_models) > 0 and len(compare_models) > 0
             model_combinations = list(product(base_models, compare_models))
-            unique_combinations = self.remove_duplicate_pairs([
+            unique_combinations = remove_duplicate_pairs([
                 combo for combo in model_combinations if combo[0] != combo[1]
             ])
         return unique_combinations
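
The snippet below is a minimal usage sketch, not part of the patch, of the now module-level remove_duplicate_pairs; it matches how get_model_combinations (above) and the new Corev2Summarizer code (below) call it. The model abbrs are made up for illustration, and the import assumes this patch is applied.

from itertools import product

from opencompass.partitioners.sub_naive import remove_duplicate_pairs

# Hypothetical model configs; only the 'abbr' key matters here.
base_models = [{'abbr': 'model-a'}, {'abbr': 'model-b'}]
compare_models = [{'abbr': 'model-b'}, {'abbr': 'model-a'}]

combos = [c for c in product(base_models, compare_models) if c[0] != c[1]]
unique = remove_duplicate_pairs(combos)
# combos contains both (model-a, model-b) and (model-b, model-a);
# only the first ordering of each pair survives deduplication.
print([(a['abbr'], b['abbr']) for a, b in unique])  # [('model-a', 'model-b')]
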
""" + dataset_cfgs = self.cfg['datasets'] work_dir = self.cfg['work_dir'] self.work_dir = work_dir @@ -118,19 +118,48 @@ class AlignmentBenchSummarizer: results_folder = osp.join(work_dir, 'results') fout_flag, fout_flag2 = 0, 0 - for subdir in os.listdir(results_folder): - if subdir not in self.eval_model_abbrs: - continue + for eval_model_abbr in self.eval_model_abbrs: + subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr subdir_path = os.path.join(results_folder, subdir) if os.path.isdir(subdir_path): - model, judge_model = subdir.split('_') - fout = osp.join(output_dir, judge_model + 'dimension.csv') - fout2 = osp.join(output_dir, judge_model + 'capability.csv') + model, judge_model = eval_model_abbr, self.judge_abbr + fout = osp.join(output_dir, + 'judged-by--' + judge_model + '-dimension.csv') + fout2 = osp.join( + output_dir, + 'judged-by--' + judge_model + '-capability.csv') for dataset in dataset_cfgs: dataset_abbr = dataset_abbr_from_cfg(dataset) - filepath = os.path.join(subdir_path, + filename = os.path.join(subdir_path, dataset_abbr + '.json') - result = mmengine.load(filepath) + partial_filename = os.path.join(subdir_path, + dataset_abbr + '_0.json') + if osp.exists(osp.realpath(filename)): + result = mmengine.load(filename) + elif osp.exists(osp.realpath(partial_filename)): + filename = partial_filename + result = {} + i = 1 + partial_dict_flag = 0 + while osp.exists(osp.realpath(filename)): + res = mmengine.load(filename) + for k, v in res.items(): + result[partial_dict_flag] = v + partial_dict_flag += 1 + filename = os.path.join( + subdir_path, + dataset_abbr + '_' + str(i) + '.json') + i += 1 + else: + result = {} + + if len(result) == 0: + print('*' * 100) + print('There are no results for ' + filename + ' or ' + + partial_filename) + print('*' * 100) + assert len(result > 0) + judged_answers = [] references = [] for k, v in result.items(): @@ -144,8 +173,14 @@ class AlignmentBenchSummarizer: print( f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.' ) + if len(judged_answers) == 0: + print('*' * 100) + print( + 'There are no extracted judgements, please change your judge model or check your prompt!!!' + ) + print('*' * 100) + assert len(judged_answers) > 0 - # 初始化一个嵌套字典用于存储模型和评分 dimension_ratings = defaultdict(int) dimension_counts = defaultdict(int) capability_ratings = defaultdict(int) @@ -225,6 +260,8 @@ class AlignmentBenchSummarizer: for sub_category in sub_categories: row.append(scores[model][sub_category]) writer.writerow(row) + else: + print(subdir_path + ' is not exist! 
diff --git a/opencompass/summarizers/corev2.py b/opencompass/summarizers/corev2.py
index 2bb7a954..648d13ca 100644
--- a/opencompass/summarizers/corev2.py
+++ b/opencompass/summarizers/corev2.py
@@ -5,6 +5,7 @@ import os.path as osp
 import re
 from collections import defaultdict
 from datetime import datetime
+from itertools import product
 
 import mmengine
 from mmengine import ConfigDict
@@ -14,6 +15,7 @@ try:
 except ImportError:
     from_csv = None
 
+from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg
 
 
@@ -54,6 +56,9 @@ class Corev2Summarizer:
         self.tasks = []
         self.cfg = config
         self.match_method = match_method
+        self.base_models = self.cfg['eval']['partitioner']['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.judge_abbr = self.cfg['judge_model']['abbr']
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -76,25 +81,70 @@ class Corev2Summarizer:
 
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
-        for subdir in os.listdir(results_folder):
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])
+
+        for model_pair in unique_combinations:
+            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
+                'abbr'], self.judge_abbr
+            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model1, model2, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + '-report.csv')
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-report.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
+                    filename = os.path.join(subdir_path,
                                             dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename + ' or ' +
+                              partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
+
                     judged_answers = []
                     references = []
                     for k, v in result.items():
                         judged_answers.append(
                             call_function(self.match_method, v['prediction']))
                         references.append(v['gold'])
+                    successful_judged_answers = len(
+                        judged_answers) - judged_answers.count(None)
                     print(
-                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers)-judged_answers.count(None)} judgements.'
+                        f'Among {len(judged_answers)} judgements, successfully extracted {successful_judged_answers} judgements.'
                     )
+                    if successful_judged_answers == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert successful_judged_answers > 0
+
                     win_both_model1, win_both_model2, half_draw_model1, half_draw_model2, categories = defaultdict(
                         float), defaultdict(float), defaultdict(
                             float), defaultdict(float), defaultdict(float)
@@ -168,6 +218,8 @@ class Corev2Summarizer:
                         writer.writerow(
                             [row] +
                             [scores[row][column] for column in columns])
+            else:
+                print(subdir_path + ' does not exist! Please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
         print(x)
diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py
index 583c9334..b0001e7e 100644
--- a/opencompass/tasks/subjective_eval.py
+++ b/opencompass/tasks/subjective_eval.py
@@ -96,8 +96,11 @@ class SubjectiveEvalTask(BaseTask):
         root, ext = osp.splitext(filename)
         partial_filename = root + '_0' + ext
         pred_strs = None
-        if osp.exists(osp.realpath(filename)) or osp.exists(
+
+        if not osp.exists(osp.realpath(filename)) and not osp.exists(
                 osp.realpath(partial_filename)):
+            return {'error': 'No predictions found.'}
+        else:
             if osp.exists(osp.realpath(filename)):
                 preds = mmengine.load(filename)
                 pred_strs = [
@@ -172,8 +175,12 @@ class SubjectiveEvalTask(BaseTask):
             eval_cfg['evaluator']['output_path'] = out_path
         icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
         references = (test_set[output_column] if output_column else None)
-        result = icl_evaluator.score(predictions=model_preds,
-                                     references=references)
+
+        if 'error' not in model_preds:
+            result = icl_evaluator.score(predictions=model_preds,
+                                         references=references)
+        else:
+            result = model_preds
 
         if 'error' in result:
             self.logger.error(
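
A minimal, self-contained sketch of the new error-propagation path in SubjectiveEvalTask: when neither the full prediction file nor its first shard exists, the prediction-loading step now returns {'error': 'No predictions found.'}, and the scoring step passes that dict straight through to the existing `if 'error' in result` logging branch. This is not from the patch: the StubEvaluator and score_or_propagate names are made up, and only the control flow and the score(predictions=..., references=...) call mirror the diff above.

class StubEvaluator:
    """Stand-in for an ICL evaluator; only the score() call shape matters here."""

    def score(self, predictions, references):
        return {'accuracy': 1.0}


def score_or_propagate(model_preds, references, evaluator=StubEvaluator()):
    # Mirrors the new branch: skip scoring when prediction loading
    # already reported an error, so the caller can log it.
    if 'error' not in model_preds:
        result = evaluator.score(predictions=model_preds,
                                 references=references)
    else:
        result = model_preds
    return result


print(score_or_propagate({'error': 'No predictions found.'}, None))
# -> {'error': 'No predictions found.'}  (logged by the caller, never scored)
print(score_or_propagate(['pred A', 'pred B'], ['ref A', 'ref B']))
# -> {'accuracy': 1.0}
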