# flake8: noqa
# yapf: disable
import csv
import os
import os.path as osp
import re
from collections import Counter, defaultdict
from datetime import datetime

from mmengine import ConfigDict

from opencompass.utils import model_abbr_from_cfg

from .compass_arena import model_abbr_from_cfg_used_in_summarizer
from .utils import get_judgeanswer_and_reference, get_outdir


def post_process_husimpleqa(judgement: str):
    pattern = r'\"evaluation\": \"(.*?)\"'
    matched_result = re.findall(pattern, judgement)
    try:
        judge = matched_result[0].lower()
        return {'judge': judge}
    except (ValueError, IndexError) as e:
        return None
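

# A minimal sketch of what this parser expects and returns. The judgement
# string below is hypothetical; the judge model's actual JSON may contain
# additional fields, which are ignored:
#
#     post_process_husimpleqa('{"evaluation": "Correct", "reason": "matches"}')
#     # -> {'judge': 'correct'}
#     post_process_husimpleqa('no structured verdict here')
#     # -> None
#
# Only the first `"evaluation": "..."` pair is matched and its value is
# lowercased; everything else in the judgement is discarded.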


def get_capability_results(
    judged_answers,
    references,
    fout,
    fout_flag,
    model_abbr,
):
    dim_judges = defaultdict(list)
    dim_counts = defaultdict(float)

    for ans, ref in zip(judged_answers, references):
        dim_judges['total'].append(ans)
        dim_counts['total'] += 1
        dim = ref['hu_specific_dim']
        dim_judges[dim].append(ans)
        dim_counts[dim] += 1

    col_name = ['model']
    column = [model_abbr]
    for dim, judges in dim_judges.items():
        c = Counter(judges)
        dim_count = dim_counts[dim]
        for judge in ['correct', 'incorrect', 'not_attempted']:
            count = c[judge]
            col_name.append(dim + ' ' + judge)
            column.append(round(count / dim_count, 2))
        col_name.append(dim + ' count')
        column.append(dim_count)

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            writer.writerow(col_name)
        writer.writerow(column)
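

# Illustrative layout of the row appended above (a sketch, not actual output;
# dimension names other than 'total' come from each reference's
# 'hu_specific_dim' field and depend on the dataset):
#
#     model, total correct, total incorrect, total not_attempted, total count,
#     <dim> correct, <dim> incorrect, <dim> not_attempted, <dim> count, ...
#
# The per-verdict values are fractions of that dimension's count rounded to
# two decimals; the header row is written only for the first model
# (fout_flag == 0), and later models each append one row to the same file.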


class HuSimpleQASummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
    """

    def __init__(self, config: ConfigDict, prompt_languages) -> None:
        self.judge_type = 'single'
        self.tasks = []
        self.cfg = config
        self.prompt_languages = prompt_languages

        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
        self.judge_function = post_process_husimpleqa

    def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            None: Results are written to one capability CSV file per language.
        """
        for language in self.prompt_languages:
            dataset_cfgs = self.cfg['datasets']
            output_dir, results_folder = get_outdir(self.cfg, time_str)
            fout_flag = 0
            for eval_model_cfg in self.eval_model_cfgs:
                eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
                show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
                subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
                if os.path.isdir(subdir_path):
                    fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability' + '_' + language + '.csv')
                    overall_judged_answers, overall_references = [], []
                    for dataset in dataset_cfgs:
                        if not dataset['abbr'].endswith('_' + language):
                            continue
                        judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
                        judged_answers = [item['judge'] for item in judged_answers]
                        overall_judged_answers += judged_answers
                        overall_references += references

                    get_capability_results(
                        overall_judged_answers,
                        overall_references,
                        fout,
                        fout_flag,
                        show_model_abbr,
                    )
                    fout_flag += 1
                else:
                    print(subdir_path + ' does not exist! Please check!')
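

# Hypothetical usage sketch (kept as a comment): the config keys below mirror
# the attributes read in __init__ and summarize(); the concrete values and the
# 'hu'/'en' language tags are assumptions, not part of this module.
#
#     from mmengine import ConfigDict
#
#     cfg = ConfigDict(dict(
#         datasets=[...],                             # abbrs end in '_<language>'
#         eval=dict(partitioner=dict(models=[...])),  # models to summarize
#         judge_models=[...],                         # first entry names the judge
#     ))
#     summarizer = HuSimpleQASummarizer(cfg, prompt_languages=['hu', 'en'])
#     summarizer.summarize()  # writes judged-by--<judge>-capability_<language>.csv per language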