# flake8: noqa
# yapf: disable
import csv
import os
import os.path as osp
import re
from collections import Counter, defaultdict
from datetime import datetime

from mmengine import ConfigDict

from opencompass.utils import model_abbr_from_cfg

from .compass_arena import model_abbr_from_cfg_used_in_summarizer
from .utils import get_judgeanswer_and_reference, get_outdir


def post_process_husimpleqa(judgement: str):
    """Extract the judge verdict from the judge model's raw output.

    Returns a dict like ``{'judge': 'correct'}`` or ``None`` if no verdict
    can be parsed.
    """
    pattern = r'\"evaluation\": \"(.*?)\"'
    matched_result = re.findall(pattern, judgement)
    try:
        judge = matched_result[0].lower()
        return {'judge': judge}
    except (ValueError, IndexError):
        return None


def get_capability_results(
    judged_answers,
    references,
    fout,
    fout_flag,
    model_abbr,
):
    """Aggregate per-dimension metrics and append one CSV row per model."""
    # Group verdicts by the dataset-specific dimension, plus an overall 'total'.
    dim_judges = defaultdict(list)
    dim_counts = defaultdict(float)
    for ans, ref in zip(judged_answers, references):
        dim_judges['total'].append(ans)
        dim_counts['total'] += 1
        dim = ref['hu_specific_dim']
        dim_judges[dim].append(ans)
        dim_counts[dim] += 1

    col_name = ['model']
    column = [model_abbr]
    for dim, judges in dim_judges.items():
        c = Counter(judges)
        dim_count = dim_counts[dim]
        for judge in ['correct', 'incorrect', 'not_attempted']:
            count = c[judge]
            col_name.append(dim + ' ' + judge)
            column.append(round(count / dim_count, 4))
        attempted = c['correct'] + c['incorrect']
        col_name.append(dim + ' correct given attempted')
        # Guard against a dimension in which every answer was 'not_attempted'.
        column.append(round(c['correct'] / attempted, 4) if attempted else 0.0)
        col_name.append(dim + ' F-Score')
        column.append(round((2 * c['correct']) / (2 * attempted + c['not_attempted']), 4))
        col_name.append(dim + ' count')
        column.append(dim_count)

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            writer.writerow(col_name)
        writer.writerow(column)


class HuSimpleQASummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
    """

    def __init__(self, config: ConfigDict, prompt_languages) -> None:
        self.judge_type = 'single'
        self.tasks = []
        self.cfg = config
        self.prompt_languages = prompt_languages
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
        self.judge_function = post_process_husimpleqa

    def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            None: Per-dimension capability results are written to one CSV
                file per prompt language under the output directory.
        """
        for language in self.prompt_languages:
            dataset_cfgs = self.cfg['datasets']
            output_dir, results_folder = get_outdir(self.cfg, time_str)
            fout_flag = 0
            for eval_model_cfg in self.eval_model_cfgs:
                eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
                show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
                subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
                if os.path.isdir(subdir_path):
                    fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability' + '_' + language + '.csv')
                    overall_judged_answers, overall_references = [], []
                    for dataset in dataset_cfgs:
                        # Only aggregate datasets belonging to the current prompt language.
                        if not dataset['abbr'].endswith('_' + language):
                            continue
                        judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
                        judged_answers = [item['judge'] for item in judged_answers]
                        overall_judged_answers += judged_answers
                        overall_references += references
                    get_capability_results(
                        overall_judged_answers,
                        overall_references,
                        fout,
                        fout_flag,
                        show_model_abbr,
                    )
                    fout_flag += 1
                else:
                    print(subdir_path + ' does not exist! Please check!')
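

# --- Minimal usage sketch (hypothetical; not part of the OpenCompass pipeline) ---
# The snippet below only illustrates the judge-output format that
# post_process_husimpleqa expects and the CSV row that get_capability_results
# writes. The sample judgement string, dimension names and model abbreviation
# are assumptions made for this demo, not values produced by the real pipeline.
# Because of the relative imports above, run it as a module
# (`python -m <package>.<this_module>`) rather than as a standalone script.
if __name__ == '__main__':
    import tempfile

    sample_judgement = '{"evaluation": "Correct", "reason": "matches the reference answer"}'
    print(post_process_husimpleqa(sample_judgement))  # -> {'judge': 'correct'}

    judged = ['correct', 'incorrect', 'correct', 'not_attempted']
    refs = [{'hu_specific_dim': 'history'}, {'hu_specific_dim': 'history'},
            {'hu_specific_dim': 'culture'}, {'hu_specific_dim': 'culture'}]
    fd, tmp_csv = tempfile.mkstemp(suffix='.csv')
    os.close(fd)
    get_capability_results(judged, refs, tmp_csv, 0, 'demo-model')
    with open(tmp_csv) as f:
        print(f.read())
    os.remove(tmp_csv)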