Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
Added predicted win rate reporting to Bradley-Terry subjective eval methods, with an option to switch between win rates and Elo ratings (#1815)
This commit is contained in:
parent 121d482378 · commit 7f2aeeff26
@@ -121,6 +121,7 @@ eval = dict(
 summarizer = dict(
     type=CompassArenaBradleyTerrySummarizer,
     rating_system='bradleyterry',
+    report_pred_win_rates=True,
     num_bootstrap=100,
     num_cpu=None,
     with_control_vars=True,
@@ -122,6 +122,7 @@ eval = dict(
 summarizer = dict(
     type=CompassArenaBradleyTerrySummarizer,
     rating_system='bradleyterry',
+    report_pred_win_rates=True,
     num_bootstrap=100,
     num_cpu=None,
     with_control_vars=True,
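For context, the two hunks above add the same `report_pred_win_rates` flag to two example configs. A minimal sketch of such a summarizer config (the import path is assumed from other opencompass subjective-eval configs; dataset, model, and judge wiring is omitted):

# Minimal sketch of a summarizer config using the new option. The import
# path is an assumption based on other opencompass subjective-eval configs;
# the rest of the eval pipeline (datasets, models, judges) is omitted.
from opencompass.summarizers import CompassArenaBradleyTerrySummarizer

summarizer = dict(
    type=CompassArenaBradleyTerrySummarizer,
    rating_system='bradleyterry',
    # True: report predicted win rates against the baseline model;
    # False: report the Elo-style arena ratings instead.
    report_pred_win_rates=True,
    num_bootstrap=100,       # bootstrap samples for confidence intervals
    num_cpu=None,            # default number of worker processes
    with_control_vars=True,  # fit style/group covariates in the BT model
)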
@@ -6,6 +6,7 @@ import math
 import multiprocessing as mp
 import os
 import os.path as osp
+from collections import defaultdict
 from datetime import datetime
 from functools import partial
 from typing import Any, Dict, List, Optional, Tuple
@@ -607,6 +608,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
         summary_groups (List, optional): Passed to DefaultSubjectiveSummarizer. Not used for this class. Defaults to None.
         prompt_db (_type_, optional): Legacy parameter kept for backward compatibility. Defaults to None.
         rating_system (str, optional): Rating system used. Currently only supports "bradleyterry". Defaults to "bradleyterry".
+        report_pred_win_rates (bool, optional): Whether to report the predicted win rates (against the baseline model) instead of the arena ratings. Defaults to True.
         num_bootstrap (int, optional): The number of bootstraps for estimating the confidence intervals. Defaults to 300.
         num_cpu (int, optional): The number of CPUs to use for the BT bootstrapping process. Defaults to None.
         with_control_vars (bool, optional): Whether to include additional covariates (including style features and group variables) when fitting the BT model. Defaults to True.
@@ -622,6 +624,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
         summary_groups: List = None,
         prompt_db=None,
         rating_system: str = 'bradleyterry',
+        report_pred_win_rates: bool = True,
         num_bootstrap: int = 300,
         num_cpu: int = None,
         with_control_vars: bool = True,
@@ -634,6 +637,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):

         self.summarizer_cfg = self.cfg['summarizer']
         self.rating_system = 'bradleyterry'  # Only bradleyterry supported
+        self.report_pred_win_rates = report_pred_win_rates
         self.num_bootstrap = num_bootstrap
         self.num_cpu = num_cpu
         self.with_control_vars = with_control_vars
@@ -897,6 +901,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
             'ranking',
             'ranking_ub',
             'model_name',
+            'predicted_win_rate',
             'rating',
             'rating_q975',
             'rating_q025',
@@ -942,6 +947,55 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):

         return flipped_dict

+    def predict_win_rate(
+        self,
+        ratings_df: pd.DataFrame,
+        baseline_model: str,
+        base: float = 10.0,
+        scaling_factor: float = 400.0,
+        round_win_rate: int = None,
+    ) -> pd.DataFrame:
+        """Predict win rates between all models using their Elo ratings.
+
+        Args:
+            ratings_df (pd.DataFrame): DataFrame containing model ratings with model names as index.
+            baseline_model (str): Name of the baseline model to use as reference.
+            base (float): Base for the Elo formula (default 10.0).
+            scaling_factor (float): Scaling factor for rating differences (default 400.0).
+            round_win_rate (int, optional): Number of decimal places to round the win rates to. Defaults to None (no rounding).
+
+        Returns:
+            pd.DataFrame: DataFrame with an additional column 'predicted_win_rate' containing
+                the predicted win rate against the baseline model.
+        """
+        if baseline_model not in ratings_df.index:
+            raise ValueError(
+                f'Baseline model {baseline_model} not found in ratings')
+
+        # Create a copy of the ratings dataframe to avoid modifying the original
+        result_df = ratings_df.copy()
+
+        # Initialize predicted_win_rate to 0.5 (the baseline's win rate against itself)
+        result_df['predicted_win_rate'] = 0.5
+
+        # Get the baseline model's rating
+        baseline_rating = ratings_df.loc[baseline_model, 'rating']
+
+        # Calculate win probabilities for all models against the baseline
+        for model, row in ratings_df.iterrows():
+            if model != baseline_model:
+                model_rating = row['rating']
+                # Elo win probability formula
+                win_rate = 1 / (1 + base**(
+                    (baseline_rating - model_rating) / scaling_factor))
+                result_df.loc[model, 'predicted_win_rate'] = win_rate
+
+        if round_win_rate is not None:
+            result_df['predicted_win_rate'] = result_df[
+                'predicted_win_rate'].round(round_win_rate)
+
+        return result_df
+
     def summarize(
         self,
         output_path: str = None,
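The new method applies the standard Elo expected-score formula: a model rated R_m is predicted to beat a baseline rated R_b with probability 1 / (1 + base**((R_b - R_m) / scaling_factor)), which is 0.5 at equal ratings and about 0.64 at a 100-point edge under the defaults (base 10, scaling factor 400). A self-contained sketch of the same conversion; model names and ratings are made up for illustration:

# Standalone sketch of the Elo win-rate conversion used by
# predict_win_rate. All model names and ratings are illustrative.
import pandas as pd

ratings_df = pd.DataFrame(
    {'rating': [1000.0, 1100.0, 900.0]},
    index=['baseline-model', 'model-a', 'model-b'],
)

base, scaling_factor = 10.0, 400.0
baseline_rating = ratings_df.loc['baseline-model', 'rating']

# Expected score of each model against the baseline:
# 1 / (1 + base**((R_baseline - R_model) / scaling_factor))
ratings_df['predicted_win_rate'] = 1 / (
    1 + base**((baseline_rating - ratings_df['rating']) / scaling_factor))

print(ratings_df.round(4))
# A 100-point rating edge maps to roughly a 0.64 predicted win rate
# (0.3599 for a 100-point deficit), and the baseline's win rate
# against itself is exactly 0.5.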
@@ -981,6 +1035,13 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
                 groups=self.groups,
             )
+
+            # Calculate predicted win_rate
+            cur_table_df = self.predict_win_rate(
+                ratings_df=cur_table_df,
+                baseline_model=base_model_abbr,
+                round_win_rate=4,
+            )

             control_coefficients[dataset_abbr][
                 base_model_abbr] = cur_ctrl_coefs
             leaderboard_tables[dataset_abbr][
@@ -1011,12 +1072,24 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
                     base_model=list(base_models)[0],
                     groups=self.groups,
                 ))
+            # Calculate predicted win_rate
+            cur_judge_all_scores_df = self.predict_win_rate(
+                ratings_df=cur_judge_all_scores_df,
+                baseline_model=list(base_models)[0],
+                round_win_rate=4,
+            )
             cur_judge_all_scores_df['judge'] = judge_abbr

             all_scores_df_list.append(cur_judge_all_scores_df)

+            # Report predicted win rate or ratings
+            if self.report_pred_win_rates:
+                _scores = cur_judge_all_scores_df['predicted_win_rate']
+            else:
+                _scores = cur_judge_all_scores_df['rating']
+
             all_scores[judge_abbr] = pd.Series(
-                cur_judge_all_scores_df['rating'],
+                _scores,
                 index=cur_judge_all_scores_df['model_name'],
             ).to_dict()

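As the last hunk shows, both columns are always computed; report_pred_win_rates only selects which one feeds the final per-judge score dict. A toy illustration of that toggle (all values made up, mirroring the columns produced by predict_win_rate):

# Toy illustration of the report_pred_win_rates toggle; the DataFrame
# mimics a leaderboard table with both columns already computed.
import pandas as pd

df = pd.DataFrame({
    'model_name': ['baseline-model', 'model-a', 'model-b'],
    'rating': [1000.0, 1100.0, 900.0],
    'predicted_win_rate': [0.5, 0.6401, 0.3599],
})

report_pred_win_rates = True
if report_pred_win_rates:
    _scores = df['predicted_win_rate']
else:
    _scores = df['rating']

# Same shape as the summarizer's per-judge score dict
print(pd.Series(_scores.values, index=df['model_name']).to_dict())
# {'baseline-model': 0.5, 'model-a': 0.6401, 'model-b': 0.3599}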