mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Bug] Fix Summarizer logic
parent 15c825a51a
commit 43cf21581a
@@ -1,30 +0,0 @@
-from mmengine.config import read_base
-
-with read_base():
-    from .groups.bbeh import bbeh_summary_groups
-
-# Get all the BBEH subset names from the imported bbeh_summary_groups
-bbeh_subsets = []
-for group in bbeh_summary_groups:
-    if group['name'] == 'bbeh':
-        bbeh_subsets = group['subsets']
-        break
-
-summarizer = dict(
-    # Include both individual datasets and the summary metrics we want to see
-    dataset_abbrs=bbeh_subsets + ['bbeh_naive_average'] + ['bbeh_harmonic_mean'],
-
-    # Define the summary group for bbeh
-    summary_groups=[
-        {
-            'name': 'bbeh_naive_average',
-            'subsets': bbeh_subsets,
-            'metric': 'naive_average'  # Explicitly specify the metric to use
-        },
-        {
-            'name': 'bbeh_harmonic_mean',
-            'subsets': bbeh_subsets,
-            'metric': 'harmonic_mean'
-        }
-    ]
-)
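Deleting this standalone config goes hand in hand with the group definitions changed below: once the BBEH summary groups declare a 'metric' themselves, a dedicated summarizer config no longer needs to restate the subsets and metrics. As a rough sketch, a replacement config could be as small as the following; the 'bbeh' row name and the locals()-collection idiom follow common OpenCompass summarizer patterns and are assumptions, not part of this commit:

```python
from mmengine.config import read_base

with read_base():
    # Pull in the BBEH summary groups defined in groups/bbeh.py.
    from .groups.bbeh import bbeh_summary_groups  # noqa: F401

summarizer = dict(
    # Rows to display in the final table; 'bbeh' is the group name used below.
    dataset_abbrs=['bbeh'],
    # Collect every *_summary_groups list imported via read_base().
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
```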
@@ -9,4 +9,5 @@ _bbeh = [
     'bbeh_spatial_reasoning', 'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic',
     'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles'
 ]
-bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh})
+bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric':'naive_average'})
+bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric':'harmonic_mean'})
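For reference, the two metrics attached to the 'bbeh' group aggregate the per-subset scores differently: the naive average is the plain arithmetic mean, while the harmonic mean weights low-scoring subsets more heavily. A tiny illustration with invented numbers (not taken from this commit):

```python
# Toy subset scores; the names are real BBEH abbreviations, the values are made up.
scores = {'bbeh_word_sorting': 80.0, 'bbeh_zebra_puzzles': 20.0}

naive_average = sum(scores.values()) / len(scores)                   # 50.0
harmonic_mean = len(scores) / sum(1 / s for s in scores.values())    # 32.0
```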
@@ -188,18 +188,18 @@ class DefaultSummarizer:
                         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                 else:
                     group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
-                    if need_smart_metric and len(group_metrics) > 1:
-                        for metric in group_metrics:
-                            for dataset_abbr in sg['subsets']:
-                                scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                            eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                    else:
-                        group_metrics = [default_metric]
-                        for dataset_abbr in sg['subsets']:
-                            metric = dataset_metrics[dataset_abbr][0]
-                            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                            eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
+                    group_metrics.append(default_metric)
+                    for metric in group_metrics:
+                        for dataset_abbr in sg['subsets']:
+                            if metric == default_metric:
+                                metric_default = dataset_metrics[dataset_abbr][0]
+                                scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
+                                    parsed_results[model_abbr][dataset_abbr][metric_default]
+                                eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
+                            else:
+                                scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
+                                    parsed_results[model_abbr][dataset_abbr][metric]
+                        eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
                 result = {}
                 for metric in scores:
                     if default_metric == 'standard_deviation':
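Read together with the config change above, the rewritten branch always appends the group's default metric (for the BBEH groups, presumably taken from their new 'metric' field) to the metrics shared by all subsets, and for that default metric it falls back to each subset's first reported metric. A self-contained sketch of that selection logic; the variable names mirror the diff, but the data and the group are invented:

```python
import functools

# Invented stand-ins for the structures DefaultSummarizer works with.
parsed_results = {'model_a': {'ds1': {'score': 70.0, 'accuracy': 68.0},
                              'ds2': {'accuracy': 55.0}}}
dataset_metrics = {'ds1': ['score', 'accuracy'], 'ds2': ['accuracy']}
sg = {'name': 'toy_group', 'subsets': ['ds1', 'ds2'], 'metric': 'naive_average'}

model_abbr = 'model_a'
default_metric = sg['metric']
scores = {}

# Metrics reported by every subset, plus the group's default metric.
group_metrics = list(functools.reduce(
    lambda a, b: a & b,
    [set(dataset_metrics[d]) for d in sg['subsets']]))
group_metrics.append(default_metric)

for metric in group_metrics:
    for dataset_abbr in sg['subsets']:
        if metric == default_metric:
            # For the aggregate row, each subset contributes its first metric.
            metric_default = dataset_metrics[dataset_abbr][0]
            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
                parsed_results[model_abbr][dataset_abbr][metric_default]
        else:
            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
                parsed_results[model_abbr][dataset_abbr][metric]

print(scores)
# {'accuracy': {'ds1@accuracy': 68.0, 'ds2@accuracy': 55.0},
#  'naive_average': {'ds1@score': 70.0, 'ds2@accuracy': 55.0}}
```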