Added predicted win rate reporting to Bradley-Terry subjective eval methods, with an option to switch between win rates and Elo ratings (#1815)

Alexander Lam 2025-01-10 18:20:25 +08:00 committed by GitHub
parent 121d482378
commit 7f2aeeff26
3 changed files with 76 additions and 1 deletion

File 1 of 3

@@ -121,6 +121,7 @@ eval = dict(
 summarizer = dict(
     type=CompassArenaBradleyTerrySummarizer,
     rating_system='bradleyterry',
+    report_pred_win_rates=True,
     num_bootstrap=100,
     num_cpu=None,
     with_control_vars=True,

File 2 of 3

@@ -122,6 +122,7 @@ eval = dict(
 summarizer = dict(
     type=CompassArenaBradleyTerrySummarizer,
     rating_system='bradleyterry',
+    report_pred_win_rates=True,
     num_bootstrap=100,
     num_cpu=None,
     with_control_vars=True,
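
Both hunks above make the same one-line addition to two example configs. As a consolidated sketch, a summarizer block after this commit might look like the following (the import path is an assumption; the remaining fields are taken from the hunks):

# Sketch only: summarizer config with the new flag.
from opencompass.summarizers import CompassArenaBradleyTerrySummarizer  # import path is an assumption

summarizer = dict(
    type=CompassArenaBradleyTerrySummarizer,
    rating_system='bradleyterry',
    report_pred_win_rates=True,  # False -> report Elo-scale ratings instead
    num_bootstrap=100,
    num_cpu=None,
    with_control_vars=True,
)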

File 3 of 3

@@ -6,6 +6,7 @@ import math
 import multiprocessing as mp
 import os
 import os.path as osp
+from collections import defaultdict
 from datetime import datetime
 from functools import partial
 from typing import Any, Dict, List, Optional, Tuple
@@ -607,6 +608,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
         summary_groups (List, optional): Passed to DefaultSubjectiveSummarizer. Not used for this class. Defaults to None.
         prompt_db (_type_, optional): Legacy parameter kept for backward compatibility. Defaults to None.
         rating_system (str, optional): Rating system used. Currently only supports "bradleyterry". Defaults to "bradleyterry".
+        report_pred_win_rates (bool, optional): Whether to report the predicted win rates (against the baseline model) instead of the arena ratings. Defaults to True.
         num_bootstrap (int, optional): The number of bootstraps for estimating the confidence intervals. Defaults to 300.
         num_cpu (int, optional): The number of CPUs to use for the BT bootstrapping process. Defaults to None.
         with_control_vars (bool, optional): Whether to include additional covariates (including style features and group variables) when fitting the BT model. Defaults to True.
@@ -622,6 +624,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
         summary_groups: List = None,
         prompt_db=None,
         rating_system: str = 'bradleyterry',
+        report_pred_win_rates: bool = True,
         num_bootstrap: int = 300,
         num_cpu: int = None,
         with_control_vars: bool = True,
@@ -634,6 +637,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
         self.summarizer_cfg = self.cfg['summarizer']
         self.rating_system = 'bradleyterry'  # Only bradleyterry supported
+        self.report_pred_win_rates = report_pred_win_rates
         self.num_bootstrap = num_bootstrap
         self.num_cpu = num_cpu
         self.with_control_vars = with_control_vars
@@ -897,6 +901,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
             'ranking',
             'ranking_ub',
             'model_name',
+            'predicted_win_rate',
             'rating',
             'rating_q975',
             'rating_q025',
@@ -942,6 +947,55 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
         return flipped_dict

+    def predict_win_rate(
+        self,
+        ratings_df: pd.DataFrame,
+        baseline_model: str,
+        base: float = 10.0,
+        scaling_factor: float = 400.0,
+        round_win_rate: int = None,
+    ) -> pd.DataFrame:
+        """Predict win rates of all models against the baseline model using
+        their Elo ratings.
+
+        Args:
+            ratings_df (pd.DataFrame): DataFrame containing model ratings with model names as index.
+            baseline_model (str): Name of baseline model to use as reference.
+            base (float): Base for the Elo formula. Defaults to 10.0.
+            scaling_factor (float): Scaling factor for rating differences. Defaults to 400.0.
+            round_win_rate (int, optional): Number of decimal places to round the predicted win rates to. Defaults to None (no rounding).
+
+        Returns:
+            pd.DataFrame: DataFrame with an additional column 'predicted_win_rate' containing the predicted win rate against the baseline model.
+        """
+        if baseline_model not in ratings_df.index:
+            raise ValueError(
+                f'Baseline model {baseline_model} not found in ratings')
+
+        # Create a copy of the ratings dataframe to avoid modifying the original
+        result_df = ratings_df.copy()
+
+        # Initialize the predicted_win_rate column; the baseline keeps 0.5
+        result_df['predicted_win_rate'] = 0.5
+
+        # Get the baseline model's rating
+        baseline_rating = ratings_df.loc[baseline_model, 'rating']
+
+        # Calculate win probabilities for all models against the baseline
+        for model, row in ratings_df.iterrows():
+            if model != baseline_model:
+                model_rating = row['rating']
+
+                # Elo win probability formula
+                win_rate = 1 / (1 + base**(
+                    (baseline_rating - model_rating) / scaling_factor))
+                result_df.loc[model, 'predicted_win_rate'] = win_rate
+
+        if round_win_rate is not None:
+            result_df['predicted_win_rate'] = result_df[
+                'predicted_win_rate'].round(round_win_rate)
+
+        return result_df
+
     def summarize(
         self,
         output_path: str = None,
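
With the defaults base=10.0 and scaling_factor=400.0, predict_win_rate applies the standard Elo expected-score formula, win_rate = 1 / (1 + base**((R_baseline - R_model) / scaling_factor)), so a model rated 100 points above the baseline is predicted to beat it about 64% of the time. A quick standalone check with illustrative ratings:

# Standalone check of the Elo win-probability formula used in predict_win_rate.
base, scaling_factor = 10.0, 400.0
baseline_rating, model_rating = 1000.0, 1100.0  # illustrative ratings

win_rate = 1 / (1 + base ** ((baseline_rating - model_rating) / scaling_factor))
print(round(win_rate, 4))  # 0.6401 -> the higher-rated model is favoured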
@@ -981,6 +1035,13 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
                     groups=self.groups,
                 )

+                # Calculate predicted win rates against the baseline model
+                cur_table_df = self.predict_win_rate(
+                    ratings_df=cur_table_df,
+                    baseline_model=base_model_abbr,
+                    round_win_rate=4,
+                )
+
                 control_coefficients[dataset_abbr][
                     base_model_abbr] = cur_ctrl_coefs
                 leaderboard_tables[dataset_abbr][
@@ -1011,12 +1072,24 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
                 base_model=list(base_models)[0],
                 groups=self.groups,
             ))

+            # Calculate predicted win rates against the baseline model
+            cur_judge_all_scores_df = self.predict_win_rate(
+                ratings_df=cur_judge_all_scores_df,
+                baseline_model=list(base_models)[0],
+                round_win_rate=4,
+            )
             cur_judge_all_scores_df['judge'] = judge_abbr
             all_scores_df_list.append(cur_judge_all_scores_df)
+
+            # Report predicted win rates or Elo ratings
+            if self.report_pred_win_rates:
+                _scores = cur_judge_all_scores_df['predicted_win_rate']
+            else:
+                _scores = cur_judge_all_scores_df['rating']
             all_scores[judge_abbr] = pd.Series(
-                cur_judge_all_scores_df['rating'],
+                _scores,
                 index=cur_judge_all_scores_df['model_name'],
             ).to_dict()
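
End to end, the new reporting path is: fit the Bradley-Terry ratings, derive predicted_win_rate against the base model, then publish whichever column report_pred_win_rates selects. A self-contained sketch of that flow on made-up data (it re-implements the formula above rather than invoking the summarizer class):

import pandas as pd

# Toy ratings table; model names and values are made up.
ratings_df = pd.DataFrame(
    {'model_name': ['base-model', 'model-a', 'model-b'],
     'rating': [1000.0, 1100.0, 950.0]},
    index=['base-model', 'model-a', 'model-b'],
)

# Predicted win rate of each model against the baseline (Elo formula);
# the baseline itself comes out at exactly 0.5.
baseline_rating = ratings_df.loc['base-model', 'rating']
ratings_df['predicted_win_rate'] = (
    1 / (1 + 10.0 ** ((baseline_rating - ratings_df['rating']) / 400.0))
).round(4)

# Mirror the report_pred_win_rates toggle from summarize().
report_pred_win_rates = True
col = 'predicted_win_rate' if report_pred_win_rates else 'rating'
scores = pd.Series(ratings_df[col].values,
                   index=ratings_df['model_name']).to_dict()
print(scores)  # {'base-model': 0.5, 'model-a': 0.6401, 'model-b': 0.4285}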