From fbb912ddf31486d3dd296f739c0c2de0ddf0c25c Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Thu, 21 Dec 2023 15:58:20 +0800
Subject: [PATCH] [Feature] Add abbr for judgemodel in subjective evaluation
 (#724)

* add_judgemodel_abbr

* add judgemodel abbr
---
 opencompass/summarizers/alignmentbench.py |  7 +++--
 opencompass/summarizers/corev2.py         |  5 ++--
 opencompass/summarizers/creationv01.py    |  5 ++--
 opencompass/tasks/subjective_eval.py      | 36 +++++++++++++++++++++--
 tools/convert_alignmentbench.py           |  2 +-
 5 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/opencompass/summarizers/alignmentbench.py b/opencompass/summarizers/alignmentbench.py
index 69a11d14..11195a8c 100644
--- a/opencompass/summarizers/alignmentbench.py
+++ b/opencompass/summarizers/alignmentbench.py
@@ -116,15 +116,16 @@ class AlignmentBenchSummarizer:
         output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
-        fout = osp.join(output_dir, 'dimension.csv')
-        fout2 = osp.join(output_dir, 'capability.csv')
+
         fout_flag, fout_flag2 = 0, 0
         for subdir in os.listdir(results_folder):
             if subdir not in self.eval_model_abbrs:
                 continue
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model = subdir
+                model, judge_model = subdir.split('_')
+                fout = osp.join(output_dir, judge_model + 'dimension.csv')
+                fout2 = osp.join(output_dir, judge_model + 'capability.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
                     filepath = os.path.join(subdir_path,
diff --git a/opencompass/summarizers/corev2.py b/opencompass/summarizers/corev2.py
index c3e9c477..2bb7a954 100644
--- a/opencompass/summarizers/corev2.py
+++ b/opencompass/summarizers/corev2.py
@@ -75,11 +75,12 @@ class Corev2Summarizer:
         output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
-        fout = osp.join(output_dir, 'report.csv')
+
         for subdir in os.listdir(results_folder):
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model1, model2 = subdir.split('_')
+                model1, model2, judge_model = subdir.split('_')
+                fout = osp.join(output_dir, judge_model + '-report.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
                     filepath = os.path.join(subdir_path,
diff --git a/opencompass/summarizers/creationv01.py b/opencompass/summarizers/creationv01.py
index cac25165..50a081ee 100644
--- a/opencompass/summarizers/creationv01.py
+++ b/opencompass/summarizers/creationv01.py
@@ -76,11 +76,12 @@ class Creationv01Summarizer:
         output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
-        fout = osp.join(output_dir, 'report.csv')
+
         for subdir in os.listdir(results_folder):
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model = subdir
+                model, judge_model = subdir.split('_')
+                fout = osp.join(output_dir, judge_model + '-report.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
                     filepath = os.path.join(subdir_path,
diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py
index 1f5e8cbf..583c9334 100644
--- a/opencompass/tasks/subjective_eval.py
+++ b/opencompass/tasks/subjective_eval.py
@@ -69,7 +69,11 @@ class SubjectiveEvalTask(BaseTask):
         # Load Dataset
         eval_cfg = dataset_cfg.get('eval_cfg')
         output_column = dataset_cfg['reader_cfg']['output_column']
-
+        if type(model_cfg) == ConfigDict:
+            model_cfg = (model_cfg, )
+        model_cfg += ({
+            'abbr': 'judged-by--' + self.judge_cfg['abbr']
+        }, )
         out_path = get_infer_output_path(
             model_cfg, dataset_cfg, osp.join(self.work_dir, 'results'))
         if osp.exists(out_path):
@@ -153,7 +157,14 @@ class SubjectiveEvalTask(BaseTask):
         # Get out_path
         out_path = get_infer_output_path(model_cfg, dataset_cfg,
                                          osp.join(self.work_dir, 'results'))
-        model_preds = self._load_model_pred(model_cfg, dataset_cfg, eval_cfg)
+        new_model_cfg = []
+        for m_cfg in model_cfg:
+            if len(m_cfg) > 1:
+                new_model_cfg.append(m_cfg)
+        if len(new_model_cfg) == 1:
+            new_model_cfg = new_model_cfg[0]
+        model_preds = self._load_model_pred(new_model_cfg, dataset_cfg,
+                                            eval_cfg)
         if not self.judge_cfg:
             raise ValueError('missing "eval.runner.task.judge_cfg"')
         eval_cfg['evaluator']['judge_cfg'] = self.judge_cfg
@@ -210,6 +221,27 @@
 
         return s[start:end]
 
+    def get_output_paths(self, file_extension: str = 'json') -> List[str]:
+        """Get the paths to the output files. Every file should exist if the
+        task succeeds.
+
+        Args:
+            file_extension (str): The file extension of the output files.
+                Default: 'json'.
+        """
+        output_paths = []
+        for model, datasets in zip(self.model_cfgs, self.dataset_cfgs):
+            for dataset in datasets:
+                if type(model) == ConfigDict:
+                    model = (model, )
+                model += ({'abbr': 'judged-by--' + self.judge_cfg['abbr']}, )
+                output_paths.append(
+                    get_infer_output_path(
+                        model, dataset,
+                        osp.join(self.work_dir, self.output_subdir),
+                        file_extension))
+        return output_paths
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Score Calculator')
diff --git a/tools/convert_alignmentbench.py b/tools/convert_alignmentbench.py
index 4ade537b..ab92f771 100644
--- a/tools/convert_alignmentbench.py
+++ b/tools/convert_alignmentbench.py
@@ -29,7 +29,7 @@ def extract_predictions_from_json(input_folder):
         # for prediction
         output_path = os.path.join(sub_folder,
                                    model_name + '_submission.csv')
-        with open(output_path, 'w', encoding='utf-8') as file:
+        with open(output_path, 'w', encoding='utf-8-sig') as file:
             writer = csv.writer(file)
             for ans in tqdm(all_predictions):
                 writer.writerow([str(ans)])