# OpenCompass/opencompass/summarizers/subjective/compass_arena_bradley_terry.py

# flake8: noqa
import functools
import getpass
import json
import math
import multiprocessing as mp
import os
import os.path as osp
from datetime import datetime
from functools import partial
from typing import Any, Dict, List, Optional, Tuple
import mmengine
import numpy as np
import pandas as pd
import tabulate
from mmengine import ConfigDict
from scipy.optimize import minimize
from scipy.special import expit
from tqdm import tqdm
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.summarizers.default_subjective import \
model_abbr_from_cfg_used_in_summarizer
from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
get_infer_output_path, get_logger,
model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash
STYLE_CONTROL_VARIABLES_V1 = [
'sum_assistant_tokens',
'header_count',
'list_count',
'bold_count',
]
EXTRA_CONTROL_VARIABLES = []
def get_matchups_models(df):
n_rows = len(df)
model_indices, models = pd.factorize(
pd.concat([df['model_a'], df['model_b']]))
matchups = np.column_stack(
[model_indices[:n_rows], model_indices[n_rows:]])
return matchups, models.to_list()
def preprocess_for_elo(df):
"""
    For Elo we want numpy arrays for matchups and outcomes:
    matchups: int32 (N, 2) array with the model ids of the two competitors in each match
    outcomes: float64 (N,) array with 1.0, 0.5, or 0.0 for a model_a win, tie, or loss
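    Illustrative example: two battles [A beats B, A ties C] give
    matchups == [[0, 1], [0, 2]], outcomes == [1.0, 0.5],
    models == ['A', 'B', 'C'] (ids follow pd.factorize order).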
"""
matchups, models = get_matchups_models(df)
outcomes = np.full(len(df), 0.5)
outcomes[df['winner'] == 'model_a'] = 1.0
outcomes[df['winner'] == 'model_b'] = 0.0
return matchups, outcomes, models
def preprocess_for_bt(df):
"""in BT we only need the unique (matchup,outcome) sets along with the
weights of how often they occur."""
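    # Illustrative example: if the pairing (A, B) with a model_a win occurs three
    # times in df, it is collapsed into a single matchups row with outcome 1.0
    # and weight 3.0, so the BT loss only evaluates that term once.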
n_rows = len(df)
# the 3 columns of schedule represent: model_a id, model_b id, outcome_id
schedule = np.full((n_rows, 3), fill_value=1, dtype=np.int32)
# set the two model cols by mapping the model names to their int ids
schedule[:, [0, 1]], models = get_matchups_models(df)
# map outcomes to integers (must be same dtype as model ids so it can be in the same array)
# model_a win -> 2, tie -> 1 (prefilled by default), model_b win -> 0
schedule[df['winner'] == 'model_a', 2] = 2
schedule[df['winner'] == 'model_b', 2] = 0
# count the number of occurrences of each observed result
matchups_outcomes, weights = np.unique(schedule,
return_counts=True,
axis=0)
matchups = matchups_outcomes[:, [0, 1]]
# map 2 -> 1.0, 1 -> 0.5, 0 -> 0.0 which will be used as labels during optimization
outcomes = matchups_outcomes[:, 2].astype(np.float64) / 2.0
weights = weights.astype(np.float64)
# each possible result is weighted according to number of times it occurred in the dataset
return matchups, outcomes, models, weights
def preprocess_for_style(
df,
apply_ratio: List[int] = None,
style_variables: List[str] = STYLE_CONTROL_VARIABLES_V1,
control_variables: List[str] = EXTRA_CONTROL_VARIABLES,
style_var_suffixes: List[str] = None,
add_one: bool = True,
normalize_style_features: bool = True,
):
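    # Builds the per-battle feature matrix used for style control: for every
    # style variable the (model_a - model_b) difference of the conv_metadata
    # counts, optionally divided by their sum (apply_ratio) and z-normalized,
    # plus any extra control columns taken directly from df.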
matchups, outcomes, models = preprocess_for_elo(
df) # this can use the same preprocessing as Elo
n = matchups.shape[0]
style_k = int(len(style_variables))
if control_variables is not None:
control_k = int(len(control_variables))
else:
control_k = 0
    if apply_ratio is None:
apply_ratio = np.repeat(1, style_k)
def extract_feature(x, feature):
val = x[feature]
if isinstance(val, int):
return val
else:
return sum(val.values())
## Style variables
if style_var_suffixes is None:
style_var_suffixes = ['_a', '_b']
style_vector = np.zeros(shape=(2 * style_k, n), dtype=np.int32)
for idx1, model_suffix in enumerate(style_var_suffixes):
for idx, element in enumerate(style_variables):
style_vector[idx + (idx1 * style_k), :] = df.conv_metadata.map(
partial(extract_feature,
feature=f'{element}{model_suffix}')).values
style_vector = np.ascontiguousarray(style_vector)
style_diff = (style_vector[:style_k] -
style_vector[style_k:]).astype(float)
style_sum = (style_vector[:style_k] + style_vector[style_k:]).astype(float)
# Add one to prevent division by zero
if add_one:
style_sum = style_sum + np.ones(style_diff.shape)
apply_ratio = np.flatnonzero(apply_ratio)
# Apply ratio where necessary (length, etc)
style_diff[apply_ratio] /= style_sum[apply_ratio]
style_mean = np.mean(style_diff, axis=1)
if normalize_style_features:
style_std = np.std(style_diff, axis=1)
# # features = normalize(style_diff)
style_features = ((style_diff - style_mean[:, np.newaxis]) /
style_std[:, np.newaxis]).T
else:
style_features = style_diff.T
## Other control variables
if control_k > 0:
control_vector = np.zeros(shape=(control_k, n), dtype=np.int32)
for idx, element in enumerate(control_variables):
control_vector[idx, :] = df[element]
control_vector = np.ascontiguousarray(control_vector).astype(float)
control_features = control_vector.T
# combine style and other control features
features = np.hstack([style_features, control_features])
else:
features = style_features
return matchups, features, outcomes, models
def fit_vectorized_elo(
matchups,
outcomes,
sample_indices,
num_models: int,
k: float = 4.0,
base: float = 10.0,
init_rating: float = 1000.0,
scale: float = 400.0,
):
"""fit multiple sets of Elo ratings on different samples of the data at the
same time."""
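    # sample_indices has shape (len(df), num_samples): column j holds the match
    # indices (drawn with replacement) of bootstrap sample j, so each loop
    # iteration below applies one sequential Elo update to every sample at once.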
alpha = math.log(base) / scale
num_samples = sample_indices.shape[1]
ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
# iterate over the rows of sample_indices, each column is an index into a match in the input arrays
sample_range = np.arange(num_samples)
for matchup_indices in sample_indices:
model_a_indices = matchups[matchup_indices, 0]
model_b_indices = matchups[matchup_indices, 1]
model_a_ratings = ratings[sample_range, model_a_indices]
model_b_ratings = ratings[sample_range, model_b_indices]
sample_outcomes = outcomes[matchup_indices]
probs = expit(alpha * (model_a_ratings - model_b_ratings))
updates = k * (sample_outcomes - probs)
ratings[sample_range, model_a_indices] += updates
ratings[sample_range, model_b_indices] -= updates
return ratings + init_rating
def compute_elo(
df,
k: float = 4.0,
base: float = 10.0,
init_rating: float = 1000.0,
scale: float = 400.0,
):
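    # Sequential (online) Elo: after every battle the winner gains and the loser
    # loses k * (outcome - expected), where
    # expected = 1 / (1 + base ** ((rating_b - rating_a) / scale)).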
matchups, outcomes, models = preprocess_for_elo(df)
alpha = math.log(base) / scale
ratings = np.full(shape=(len(models), ), fill_value=init_rating)
for (model_a_idx, model_b_idx), outcome in zip(matchups, outcomes):
prob = 1.0 / (1.0 +
math.exp(alpha *
(ratings[model_b_idx] - ratings[model_a_idx])))
update = k * (outcome - prob)
ratings[model_a_idx] += update
ratings[model_b_idx] -= update
return {model: ratings[idx] for idx, model in enumerate(models)}
def compute_bootstrap_elo(
df,
num_round: int = 100,
k: float = 4.0,
base: float = 10.0,
init_rating: float = 1000.0,
scale: float = 400.0,
):
matchups, outcomes, models = preprocess_for_elo(df)
sample_indices = np.random.randint(low=0,
high=len(df),
size=(len(df), num_round))
ratings = fit_vectorized_elo(matchups, outcomes, sample_indices,
len(models), k, base, init_rating, scale)
df = pd.DataFrame(data=ratings, columns=models)
return df[df.median().sort_values(ascending=False).index]
def bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha=1.0):
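    # Weighted negative log-likelihood of the Bradley-Terry model:
    #   loss = -sum_i w_i * (y_i * log(p_i) + (1 - y_i) * log(1 - p_i)),
    # where p_i = sigmoid(alpha * (rating_a_i - rating_b_i)) and y_i is 1.0,
    # 0.5 or 0.0 for a model_a win, tie or loss.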
matchup_ratings = ratings[matchups]
logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
probs = expit(logits)
# this form naturally counts a draw as half a win and half a loss
loss = -((np.log(probs) * outcomes + np.log(1.0 - probs) *
(1.0 - outcomes)) * weights).sum()
matchups_grads = -alpha * (outcomes - probs) * weights
model_grad = np.zeros_like(ratings)
# aggregate gradients at the model level using the indices in matchups
np.add.at(
model_grad,
matchups[:, [0, 1]],
matchups_grads[:, None] * np.array([1.0, -1.0], dtype=np.float64),
)
return loss, model_grad
def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6):
initial_ratings = np.zeros(n_models, dtype=np.float64)
result = minimize(
fun=bt_loss_and_grad,
x0=initial_ratings,
args=(matchups, outcomes, weights, alpha),
jac=True,
method='L-BFGS-B',
options={
'disp': False,
'maxiter': 100,
'gtol': tol
},
)
return result['x']
def scale_and_offset(
ratings,
models,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
):
"""convert ratings from the natural scale to the Elo rating scale with an
anchored baseline."""
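    # For example, with the defaults a natural-scale rating of 0.0 maps to 1000
    # and 1.0 maps to 1400; if baseline_model is given, every rating is then
    # shifted so that the baseline sits exactly at baseline_rating.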
scaled_ratings = (ratings * scale) + init_rating
if baseline_model is not None:
if baseline_model in models:
baseline_idx = models.index(baseline_model)
scaled_ratings += baseline_rating - scaled_ratings[...,
[baseline_idx]]
return scaled_ratings
def compute_bt(
df,
base: float = 10.0,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
tol: float = 1e-6,
):
matchups, outcomes, models, weights = preprocess_for_bt(df)
ratings = fit_bt(matchups, outcomes, weights, len(models), math.log(base),
tol)
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
return pd.Series(scaled_ratings, index=models).sort_values(ascending=False)
def compute_bootstrap_bt(
battles,
num_round: int,
base: float = 10.0,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
tol: float = 1e-6,
num_cpu: int = None,
):
matchups, outcomes, models, weights = preprocess_for_bt(battles)
# bootstrap sample the unique outcomes and their counts directly using the multinomial distribution
rng = np.random.default_rng(seed=0)
idxs = rng.multinomial(n=len(battles),
pvals=weights / weights.sum(),
size=(num_round))
# only the distribution over their occurrence counts changes between samples (and it can be 0)
boot_weights = idxs.astype(np.float64) / len(battles)
# the only thing different across samples is the distribution of weights
bt_fn = partial(fit_bt,
matchups,
outcomes,
n_models=len(models),
alpha=np.log(base),
tol=tol)
with mp.Pool(num_cpu if num_cpu else os.cpu_count() - 1) as pool:
results = list(
tqdm(pool.imap_unordered(bt_fn, boot_weights), total=num_round))
ratings = np.array(results)
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
df = pd.DataFrame(scaled_ratings, columns=models)
return df[df.median().sort_values(ascending=False).index]
DIFF_MASK = np.array(
[1.0, -1.0], dtype=np.float64
) # create globally to not incur the instantiation cost in each call
def contextual_bt_loss_and_grad(
params,
n_competitors,
matchups,
features,
outcomes,
alpha=1.0,
reg=1.0,
half_reg=0.5,
):
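    # Contextual Bradley-Terry: the win probability mixes the rating gap with a
    # linear term over the style/control features,
    #   p_i = sigmoid(alpha * (rating_a_i - rating_b_i) + features_i . beta),
    # and the loss is the L2-regularized negative log-likelihood.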
reg_loss = half_reg * np.inner(params, params)
# Split params into ratings and feature parameters
ratings = params[:n_competitors]
feature_params = params[n_competitors:]
matchup_ratings = ratings[matchups]
bt_logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
context_logits = np.dot(features, feature_params)
probs = expit(bt_logits + context_logits)
loss = (-((np.log(probs) * outcomes + np.log(1.0 - probs) *
(1.0 - outcomes))).sum() + reg_loss)
error = outcomes - probs
grad = reg * params # initialize the grad as the regularization grad
matchups_grads = -alpha * error
np.add.at(grad[:n_competitors], matchups[:, [0, 1]],
matchups_grads[:, None] * DIFF_MASK)
grad[n_competitors:] -= np.dot(features.T, error)
return loss, grad
# Note on regularization:
# the default reg is 0.5, whereas the LogisticRegression default is 1.0:
# in the original implementation matchups were duplicated, which made the ratio
# of log loss to reg loss "twice as high", so in this non-duplicated version we
# also halve the reg to keep parity.
def fit_contextual_bt(
matchups,
features,
outcomes,
models,
idxs=None,
alpha=math.log(10.0),
reg=0.5,
tol=1e-6,
):
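    # Returns a single parameter vector of length n_models + n_features: the
    # first n_models entries are ratings, the rest are feature coefficients.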
n_features = features.shape[1]
n_models = len(models)
initial_params = np.zeros(n_models + n_features, dtype=np.float64)
half_reg = reg / 2.0
# sample idxs optionally allow for fitting on a bootstrap sample of the dataset
if idxs is not None:
matchups, features, outcomes = matchups[idxs], features[
idxs], outcomes[idxs]
result = minimize(
fun=contextual_bt_loss_and_grad,
x0=initial_params,
args=(n_models, matchups, features, outcomes, alpha, reg, half_reg),
jac=True,
method='L-BFGS-B',
options={
'disp': False,
'maxiter': 100,
'gtol': tol
},
)
return result['x']
def compute_style_control(
df: pd.DataFrame,
alpha: float = math.log(10.0),
reg: float = 0.5,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
normalize_style_features: bool = True,
control_variables: List[str] = None,
odds_ratio: bool = True,
tol: float = 1e-6,
):
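    # Minimal usage sketch (illustrative names; df is expected to carry
    # 'model_a', 'model_b', 'winner' and a 'conv_metadata' dict with the
    # *_a / *_b counts listed in STYLE_CONTROL_VARIABLES_V1):
    #   ratings, coefs = compute_style_control(df, baseline_model='model-x',
    #                                           control_variables=['difficulty'])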
    if control_variables is not None:
        # One-hot encode the categorical control variables. The model is fitted
        # without an intercept, so we keep all levels of each categorical.
        _df = pd.get_dummies(
            data=df,
            columns=control_variables,
            drop_first=False,
        )
        # Collect the names of the one-hot encoded control columns
        one_hot_ctrls = []
        for col in _df.columns:
            for ctrl_var in control_variables:
                if col.startswith(ctrl_var):
                    one_hot_ctrls.append(col)
                    break
    else:
        # No extra control variables were provided; fall back to the raw df
        _df = df
        one_hot_ctrls = []
matchups, features, outcomes, models = preprocess_for_style(
_df,
normalize_style_features=normalize_style_features,
style_variables=STYLE_CONTROL_VARIABLES_V1,
control_variables=one_hot_ctrls,
)
ratings_params = fit_contextual_bt(
matchups,
features,
outcomes,
models=models,
alpha=alpha,
reg=reg,
tol=tol,
)
ratings = ratings_params[:len(models)]
if odds_ratio:
params = np.exp(ratings_params[len(models):])
else:
params = ratings_params[len(models):]
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
scaled_ratings = pd.Series(scaled_ratings,
index=models).sort_values(ascending=False)
control_coefficients = {
k: v
for k, v in zip(STYLE_CONTROL_VARIABLES_V1 + one_hot_ctrls, params)
}
return scaled_ratings, control_coefficients
def compute_bootstrap_style_control(
df,
num_round: int,
alpha: float = math.log(10.0),
reg: float = 0.5,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
normalize_style_features: bool = True,
control_variables: List[str] = None,
odds_ratio: bool = True,
tol: float = 1e-6,
num_cpu: int = None,
):
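    # Same model as compute_style_control, but refit num_round times on
    # bootstrap resamples of the battles; returns a (num_round x n_models)
    # DataFrame of ratings plus bootstrap-averaged control coefficients.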
    if control_variables is not None:
        # One-hot encode the categorical control variables. The model is fitted
        # without an intercept, so we keep all levels of each categorical.
        _df = pd.get_dummies(
            data=df,
            columns=control_variables,
            drop_first=False,
        )
        # Collect the names of the one-hot encoded control columns
        one_hot_ctrls = []
        for col in _df.columns:
            for ctrl_var in control_variables:
                if col.startswith(ctrl_var):
                    one_hot_ctrls.append(col)
                    break
    else:
        # No extra control variables were provided; fall back to the raw df
        _df = df
        one_hot_ctrls = []
matchups, features, outcomes, models = preprocess_for_style(
_df,
normalize_style_features=normalize_style_features,
style_variables=STYLE_CONTROL_VARIABLES_V1,
control_variables=one_hot_ctrls,
)
contextual_bt_fn = partial(
fit_contextual_bt,
matchups,
features,
outcomes,
models,
alpha=alpha,
reg=reg,
tol=tol,
)
boot_idxs = np.random.randint(low=0,
high=matchups.shape[0],
size=(num_round, matchups.shape[0]))
with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool:
results = list(
tqdm(pool.imap_unordered(contextual_bt_fn, boot_idxs),
total=num_round))
ratings_params = np.array(results)
ratings = ratings_params[:, :len(models)]
if odds_ratio:
params = np.exp(ratings_params[:, len(models):].mean(axis=0))
else:
params = ratings_params[:, len(models):].mean(axis=0)
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
df = pd.DataFrame(scaled_ratings, columns=models)
control_coefficients = {
k: v
for k, v in zip(STYLE_CONTROL_VARIABLES_V1 + one_hot_ctrls, params)
}
return df[df.median().sort_values(
ascending=False).index], control_coefficients
class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
"""Summarizer for fitting and Bradley-Terry model to pairwise matchups
according to https://github.com/lm-sys/FastChat/tree/main.
Args:
config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime.
dataset_abbrs (Optional[List[str]], optional): Dataset abbreviations to be listed in the summary. Defaults to None.
summary_groups (List, optional): Passed to DefaultSubjectiveSummarizer. Not used for this class. Defaults to None.
        prompt_db (Any, optional): Legacy parameter kept for backward compatibility. Defaults to None.
rating_system (str, optional): Rating system used. Currently only supports "bradleyterry". Defaults to "bradleyterry".
num_bootstrap (int, optional): The number of bootstraps for estimating the confidence intervals. Defaults to 300.
num_cpu (int, optional): The number of CPUs to use for the BT bootstrapping process. Defaults to None.
with_control_vars (bool, optional): Whether to include additional covariates (including style features and group variables) when fitting the BT model. Defaults to True.
normalize_style_features (bool, optional): Whether to normalize style features BEFORE fitting the BT model (implementation by FastChat). Turn this off for easier interpretation of odds ratios (when odds_ratio==True). Defaults to True.
odds_ratio (bool, optional): Whether to report odds ratios (np.exp(beta_k)) instead of the original coefficients. Defaults to True.
groups (List[str], optional): Group variables to include while fitting the BT model. These must be available in the input dataset for each observation. Defaults to None.
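
    Example:
        A minimal summarizer config sketch (illustrative values; the group
        names are placeholders and must exist as columns in the judged
        matches):

            summarizer = dict(
                type=CompassArenaBradleyTerrySummarizer,
                rating_system='bradleyterry',
                num_bootstrap=100,
                num_cpu=None,
                with_control_vars=True,
                normalize_style_features=False,
                odds_ratio=True,
                groups=['difficulty', 'category'],
            )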
"""
def __init__(
self,
config: ConfigDict,
dataset_abbrs: Optional[List[str]] = None,
summary_groups: List = None,
prompt_db=None,
rating_system: str = 'bradleyterry',
num_bootstrap: int = 300,
num_cpu: int = None,
with_control_vars: bool = True,
normalize_style_features: bool = True,
odds_ratio: bool = True,
groups: List[str] = None,
) -> None:
summary_groups = [] if summary_groups is None else summary_groups
super().__init__(config, dataset_abbrs, summary_groups, prompt_db)
self.summarizer_cfg = self.cfg['summarizer']
self.rating_system = 'bradleyterry' # Only bradleyterry supported
self.num_bootstrap = num_bootstrap
self.num_cpu = num_cpu
self.with_control_vars = with_control_vars
self.normalize_style_features = normalize_style_features
self.odds_ratio = odds_ratio
self.groups = [] if groups is None else groups
def _pick_up_results(self, judge_abbr):
"""The function reads the numerical results of evaluations from the
output folder based on the configuration file, and ultimately returns
four dictionaries, each containing processed information in different
formats. The contents of the four dictionaries are as follows:
- raw_results: contains the raw results of each model on each dataset (excluding details).
- parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored.
- dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. The list is ordered according to the METRIC_WHITELIST,
with metrics appearing earlier considered more important.
- dataset_eval_mode: contains the evaluation mode for each dataset.
"""
        # raw_results: {dataset_abbr: {base_model_abbr: [matches]}}
raw_results: Dict[str, Dict[str, Any]] = {}
# # parsed_results: {model_abbr: {dataset_abbr: {metric: score}}}
# parsed_results: Dict[str, Dict[str, Dict[str, float]]] = {}
# # dataset_metrics: {dataset_abbr: [metric]}
# dataset_metrics: Dict[str, List[str]] = {}
for model in self.model_cfgs:
model_abbr = model_abbr_from_cfg_used_in_summarizer(model)
# parsed_results.setdefault(model_abbr, {})
# raw_results.setdefault(model_abbr, {})
for dataset in self.dataset_cfgs:
base_models = dataset.get('base_models', None)
if base_models is None:
                    raise ValueError(
                        'CompassArenaBradleyTerrySummarizer requires at least '
                        'one base model to be specified in `base_models` in '
                        'the dataset config.')
base_models_list = [item['abbr'] for item in base_models]
dataset_abbr = dataset_abbr_from_cfg(dataset)
raw_results.setdefault(dataset_abbr, {})
for base_model_abbr in base_models_list:
raw_results[dataset_abbr].setdefault(base_model_abbr, [])
origin_path = get_infer_output_path(
model, dataset, osp.join(self.work_dir, 'results'))
if base_model_abbr != '':
temp_path, dataset_json_name = (
origin_path.rsplit('/', 1)[0],
origin_path.rsplit('/', 1)[1],
)
filepath = osp.join(
temp_path.rsplit('/', 1)[0],
base_model_abbr + '_' +
temp_path.rsplit('/', 1)[1] + '_judged-by--' +
judge_abbr,
dataset_json_name,
)
else:
filepath = osp.join(
origin_path.rsplit('/', 1)[0] + '_judged-by--' +
judge_abbr,
origin_path.rsplit('/', 1)[1],
)
if not osp.exists(filepath):
continue
result = mmengine.load(filepath)
result.pop('details', None)
# raw_results[dataset_abbr] = result
raw_results[dataset_abbr][base_model_abbr].extend(
result['matches'])
if 'error' in result:
self.logger.debug(
f'error in {model_abbr} {dataset_abbr} {result["error"]}'
)
continue
# dataset_eval_mode: {dataset_abbr: eval_mode}
dataset_eval_mode: Dict[str, str] = {}
for dataset in self.dataset_cfgs:
inferencer = (dataset.get('infer_cfg', {}).get('inferencer',
{}).get('type', ''))
inferencer = (inferencer if isinstance(inferencer, str) else
inferencer.__name__)
dataset_abbr = dataset_abbr_from_cfg(dataset)
if 'GenInferencer' in inferencer:
dataset_eval_mode[dataset_abbr] = 'gen'
elif 'PPLInferencer' in inferencer:
dataset_eval_mode[dataset_abbr] = 'ppl'
elif 'LLInferencer' in inferencer:
dataset_eval_mode[dataset_abbr] = 'll'
else:
dataset_eval_mode[dataset_abbr] = 'unknown'
self.logger.warning(
f'unknown inferencer: {inferencer} - {dataset_abbr}')
# return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
return raw_results, dataset_eval_mode
def _calculate_ratings(
self,
matches: Dict,
base_model: str = None,
groups: List[str] = None,
) -> Tuple[pd.DataFrame, Dict]:
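        # Fits the (optionally style/group-controlled) Bradley-Terry model once
        # on all matches for the point estimates and num_bootstrap more times on
        # resampled matches for the confidence intervals, then assembles the
        # leaderboard table.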
rating_system = self.rating_system
num_bootstrap = self.num_bootstrap
num_cpu = self.num_cpu
with_control_vars = self.with_control_vars
matches_df = pd.DataFrame(matches)
num_battles = (matches_df['model_a'].value_counts().add(
matches_df['model_b'].value_counts(), fill_value=0))
# if rating_system == "bradleyterry":
if with_control_vars:
bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
df=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
odds_ratio=self.odds_ratio,
)
elo_rating_final, coef_final = compute_style_control(
df=matches_df,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
odds_ratio=self.odds_ratio,
)
else:
bootstrap_df = compute_bootstrap_bt(
battles=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
num_cpu=num_cpu,
)
elo_rating_final = compute_bt(
df=matches_df,
baseline_model=base_model,
)
# print(elo_rating_final)
# elif rating_system == "elo":
# bootstrap_df = compute_bootstrap_elo(
# df=matches_df,
# num_round=num_bootstrap,
# num_cpu=num_cpu,
# )
# elo_rating_final = compute_elo(matches_df)
model_rating_q025 = bootstrap_df.quantile(0.025)
model_rating_q975 = bootstrap_df.quantile(0.975)
# compute ranking based on CI
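        # ranking_ub: a model's rank only worsens when another model's 2.5%
        # quantile exceeds its own 97.5% quantile, i.e. when the bootstrap CIs
        # are clearly separated.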
model_order = list(elo_rating_final.index)
ranking = {}
for i, model_a in enumerate(model_order):
ranking[model_a] = 1
for j, model_b in enumerate(model_order):
if i == j:
continue
if model_rating_q025[model_b] > model_rating_q975[model_a]:
ranking[model_a] += 1
leaderboard_table_df = pd.DataFrame(
{
'rating': elo_rating_final,
'ranking_ub': pd.Series(ranking),
'std_dev': bootstrap_df.std(),
'rating_q975': model_rating_q975,
'rating_q025': model_rating_q025,
'num_battles': num_battles,
}, )
leaderboard_table_df['model_name'] = leaderboard_table_df.index
leaderboard_table_df.sort_values(
by=['rating'],
ascending=False,
inplace=True,
)
leaderboard_table_df['ranking'] = np.arange(
1,
len(leaderboard_table_df) + 1)
if rating_system == 'bradleyterry' and with_control_vars:
control_coefficients = {
'bootstrap': bootstrap_coef,
'final': coef_final,
}
else:
control_coefficients = {'final': []}
return leaderboard_table_df, control_coefficients['final']
def _output_to_file(
self,
output_path,
time_str: str,
tables: Dict,
metadata: Dict,
judge_abbr: str,
dataset_eval_mode: str,
):
# Output to file
if output_path is None:
output_path = osp.join(self.work_dir, 'summary',
f'summary_{time_str}.json')
output_csv_path = osp.join(self.work_dir, 'summary',
f'summary_{time_str}.csv')
else:
output_csv_path = output_path.replace('.json', '.csv')
output_path = output_path.split(
'.json')[0] + '_by_' + judge_abbr + '.json'
output_dir = osp.split(output_path)[0]
mmengine.mkdir_or_exist(output_dir)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=4)
self.logger.info(f'write summary to {osp.abspath(output_path)}')
prompt_version = {
dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6]
for d in self.dataset_cfgs
}
full_results = []
for base_model_abbr, datasets in tables.items():
base_model_results = []
for dataset_abbr, table_df in datasets.items():
table_df['dataset'] = dataset_abbr
table_df['version'] = prompt_version.get(dataset_abbr, '-')
table_df['metric'] = 'bt_rating'
table_df['mode'] = dataset_eval_mode[dataset_abbr]
table_df['base_model'] = base_model_abbr
base_model_results.append(table_df)
cur_base_model_result_df = pd.concat(base_model_results)
full_results.append(cur_base_model_result_df)
full_results_df = pd.concat(full_results)
full_results_df = full_results_df[[
'dataset',
'version',
'base_model',
'metric',
'mode',
'ranking',
'ranking_ub',
'model_name',
'rating',
'rating_q975',
'rating_q025',
'std_dev',
'num_battles',
]]
output_csv_path = (output_csv_path.split('.csv')[0] + '_by_' +
judge_abbr + '.csv')
with pd.option_context(
'display.max_rows',
20,
'display.max_columns',
20,
'display.expand_frame_repr',
False,
):
print(full_results_df.reset_index(drop=True).round(2))
full_results_df.to_csv(
output_csv_path,
index=False,
)
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
def flip_dict_levels(self, original_dict: Dict):
"""Flips the two levels of a nested dictionary so that dict[lvl1][lvl2]
becomes dict[lvl2][lvl1].
Args:
original_dict (dict): The original nested dictionary.
Returns:
dict: The flipped dictionary.
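        Example:
            {'ds1': {'base1': 'x'}, 'ds2': {'base1': 'y'}} becomes
            {'base1': {'ds1': 'x', 'ds2': 'y'}}.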
"""
flipped_dict = {}
for lvl1, lvl2_dict in original_dict.items():
for lvl2, value in lvl2_dict.items():
if lvl2 not in flipped_dict:
flipped_dict[lvl2] = {}
flipped_dict[lvl2][lvl1] = value
return flipped_dict
def summarize(
self,
output_path: str = None,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
):
"""Summarize evaluation results and format output table.
Args:
output_path (str, optional): Output path. Defaults to None.
time_str (str, optional): Timestamp for file suffix. Defaults to
datetime.now().strftime('%Y%m%d_%H%M%S').
"""
all_scores = {}
for judge_model in self.judge_models:
control_coefficients = {}
leaderboard_tables = {}
judge_abbr = model_abbr_from_cfg(judge_model)
# pick up results
raw_results, dataset_eval_mode = self._pick_up_results(judge_abbr)
all_matches = []
for dataset_abbr, base_models in raw_results.items():
control_coefficients[dataset_abbr] = {}
leaderboard_tables[dataset_abbr] = {}
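                # Only the matches judged against the first base model are
                # pooled into all_matches for the combined all-dataset fit below.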
dataset_matches = base_models[list(base_models)[0]]
all_matches.extend(dataset_matches)
for base_model_abbr, matches in base_models.items():
cur_table_df, cur_ctrl_coefs = self._calculate_ratings(
matches=matches,
base_model=base_model_abbr,
groups=self.groups,
)
control_coefficients[dataset_abbr][
base_model_abbr] = cur_ctrl_coefs
leaderboard_tables[dataset_abbr][
base_model_abbr] = cur_table_df
print('-' * 10 +
f"{dataset_abbr + ':' + base_model_abbr}\n" +
'-' * 10)
# print(cur_table_df)
print(cur_ctrl_coefs)
leaderboard_tables = self.flip_dict_levels(leaderboard_tables)
# Output to .json / .csv files
self._output_to_file(
output_path=output_path,
time_str=time_str,
tables=leaderboard_tables,
metadata=control_coefficients,
judge_abbr=judge_abbr,
dataset_eval_mode=dataset_eval_mode,
)
# Fit another BT model with the first base_model and combining matches from all datasets
all_scores_df, all_scores_ctrl_coefs = self._calculate_ratings(
matches=all_matches,
base_model=list(base_models)[0],
groups=self.groups,
)
all_scores[judge_abbr] = pd.Series(
all_scores_df['rating'],
index=all_scores_df['model_name'],
).to_dict()
print(f'{all_scores=}')
print(f'{all_scores_ctrl_coefs=}')
return {'CompassArenaSubjBenchBradleyTerry': all_scores}