diff --git a/opencompass/configs/summarizers/bbeh.py b/opencompass/configs/summarizers/bbeh.py
new file mode 100644
index 00000000..ba469f82
--- /dev/null
+++ b/opencompass/configs/summarizers/bbeh.py
@@ -0,0 +1,31 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .groups.bbeh import bbeh_summary_groups
+
+# Collect the BBEH subset names from the imported 'bbeh' summary group.
+# The list stays empty if no group with that name is found.
+bbeh_subsets = []
+for group in bbeh_summary_groups:
+    if group['name'] == 'bbeh':
+        bbeh_subsets = group['subsets']
+        break
+
+summarizer = dict(
+    # Report every individual subset followed by the two aggregate metrics.
+    dataset_abbrs=bbeh_subsets + ['bbeh_naive_average', 'bbeh_harmonic_mean'],
+
+    # Aggregate the same subsets twice: once per averaging metric.
+    summary_groups=[
+        {
+            'name': 'bbeh_naive_average',
+            'subsets': bbeh_subsets,
+            'metric': 'naive_average',  # Explicitly specify the metric to use
+        },
+        {
+            'name': 'bbeh_harmonic_mean',
+            'subsets': bbeh_subsets,
+            'metric': 'harmonic_mean',
+        },
+    ],
+)
diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py
index 8a0da5b2..f1094f14 100644
--- a/opencompass/summarizers/default.py
+++ b/opencompass/summarizers/default.py
@@ -171,6 +171,8 @@ class DefaultSummarizer:
                         default_metric = 'sum'
                     elif sg.get('weights', []):
                         default_metric = 'weighted_average'
+                    elif sg.get('harmonic_mean', False):
+                        default_metric = 'harmonic_mean'
                     else:
                         default_metric = 'naive_average'
 
@@ -204,6 +206,17 @@ class DefaultSummarizer:
                         avg = sum(scores[metric].values()) / len(scores[metric])
                         variance = sum((scores[metric][k] - avg) ** 2 for k in scores[metric]) / len(scores[metric])
                         scores[metric] = result[metric] = math.sqrt(variance)
+                    elif default_metric == 'harmonic_mean':
+                        # The harmonic mean is undefined for non-positive values: a zero
+                        # score divides by zero and a negative one corrupts the result.
+                        # Only the offending entries are raised to a floor of 1; valid
+                        # scores (including those between 0 and 1) are used unchanged.
+                        # NOTE(review): a floor of 1 presumes percentage-scale scores - confirm.
+                        if any(scores[metric][k] <= 0 for k in scores[metric]):
+                            self.logger.warning(f'Non-positive values found when calculating harmonic mean for {sg["name"]}')
+                        numerator = len(scores[metric])
+                        denominator = sum(1 / (1 if scores[metric][k] <= 0 else scores[metric][k]) for k in scores[metric])
+                        scores[metric] = result[metric] = numerator / denominator
                     else:
                         if sg.get('weights', []):
                             # check sg['weights'][k] != 0 in case of scores[metric][k] is NaN