diff --git a/opencompass/configs/summarizers/bbeh.py b/opencompass/configs/summarizers/bbeh.py
deleted file mode 100644
index ba469f82..00000000
--- a/opencompass/configs/summarizers/bbeh.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from mmengine.config import read_base
-
-with read_base():
-    from .groups.bbeh import bbeh_summary_groups
-
-# Get all the BBEH subset names from the imported bbeh_summary_groups
-bbeh_subsets = []
-for group in bbeh_summary_groups:
-    if group['name'] == 'bbeh':
-        bbeh_subsets = group['subsets']
-        break
-
-summarizer = dict(
-    # Include both individual datasets and the summary metrics we want to see
-    dataset_abbrs=bbeh_subsets + ['bbeh_naive_average'] + ['bbeh_harmonic_mean'],
-
-    # Define the summary group for bbeh
-    summary_groups=[
-        {
-            'name': 'bbeh_naive_average',
-            'subsets': bbeh_subsets,
-            'metric': 'naive_average'  # Explicitly specify the metric to use
-        },
-        {
-            'name': 'bbeh_harmonic_mean',
-            'subsets': bbeh_subsets,
-            'metric': 'harmonic_mean'
-        }
-    ]
-)
\ No newline at end of file
diff --git a/opencompass/configs/summarizers/groups/bbeh.py b/opencompass/configs/summarizers/groups/bbeh.py
index 5e5cc222..95697144 100644
--- a/opencompass/configs/summarizers/groups/bbeh.py
+++ b/opencompass/configs/summarizers/groups/bbeh.py
@@ -9,4 +9,5 @@ _bbeh = [
     'bbeh_spatial_reasoning', 'bbeh_sportqa', 'bbeh_temporal_sequence',
     'bbeh_time_arithmetic', 'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles'
 ]
-bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh})
+bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric': 'naive_average'})
+bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric': 'harmonic_mean'})
\ No newline at end of file
diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py
index f1094f14..88dd793b 100644
--- a/opencompass/summarizers/default.py
+++ b/opencompass/summarizers/default.py
@@ -188,18 +188,18 @@ class DefaultSummarizer:
                         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                 else:
                     group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
-                    if need_smart_metric and len(group_metrics) > 1:
-                        for metric in group_metrics:
-                            for dataset_abbr in sg['subsets']:
-                                scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                            eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                    else:
-                        group_metrics = [default_metric]
+                    group_metrics.append(default_metric)
+                    for metric in group_metrics:
                         for dataset_abbr in sg['subsets']:
-                            metric = dataset_metrics[dataset_abbr][0]
-                            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                            eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
-
+                            if metric == default_metric:
+                                metric_default = dataset_metrics[dataset_abbr][0]
+                                scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
+                                    parsed_results[model_abbr][dataset_abbr][metric_default]
+                                eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
+                            else:
+                                scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
+                                    parsed_results[model_abbr][dataset_abbr][metric]
+                                eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
                 result = {}
                 for metric in scores:
                     if default_metric == 'standard_deviation':
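
Note (not part of the patch): the two 'bbeh' summary groups above differ only in their aggregation metric. Below is a minimal, hedged sketch of what that difference means numerically, using made-up subset scores and plain Python rather than OpenCompass internals.

# Illustration only: made-up scores for three BBEH subsets.
from statistics import harmonic_mean

subset_scores = {
    'bbeh_word_sorting': 62.0,
    'bbeh_time_arithmetic': 45.0,
    'bbeh_zebra_puzzles': 18.0,
}

# 'naive_average' corresponds to the arithmetic mean over subsets.
naive_avg = sum(subset_scores.values()) / len(subset_scores)

# 'harmonic_mean' is dominated by the weakest subset, so it penalises
# models that fail badly on any single BBEH task family.
harm = harmonic_mean(subset_scores.values())

print(f'naive_average: {naive_avg:.2f}')   # 41.67
print(f'harmonic_mean: {harm:.2f}')        # 31.95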